@inproceedings{abbeelApprenticeshipLearningInverse2004,
  author = {Abbeel, Pieter and Ng, Andrew Y.},
  title = {Apprenticeship Learning via Inverse Reinforcement Learning},
  booktitle = {Proceedings of the Twenty-First International Conference on {{Machine}} Learning},
  series = {{{ICML}} '04},
  year = 2004,
  month = jul,
  pages = {1},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  isbn = {978-1-58113-838-2},
  doi = {10.1145/1015330.1015430},
  urldate = {2025-12-13},
  abstract = {We consider learning in a Markov decision process where we are not explicitly given a reward function, but where instead we can observe an expert demonstrating the task that we want to learn to perform. This setting is useful in applications (such as the task of driving) where it may be difficult to write down an explicit reward function specifying exactly how different desiderata should be traded off. We think of the expert as trying to maximize a reward function that is expressible as a linear combination of known features, and give an algorithm for learning the task demonstrated by the expert. Our algorithm is based on using "inverse reinforcement learning" to try to recover the unknown reward function. We show that our algorithm terminates in a small number of iterations, and that even though we may never recover the expert's reward function, the policy output by the algorithm will attain performance close to that of the expert, where here performance is measured with respect to the expert's unknown reward function.}
}

@article{adamowiczCombiningRevealedStated1994,
  title = {Combining {{Revealed}} and {{Stated Preference Methods}} for {{Valuing Environmental Amenities}}},
  author = {Adamowicz, W. and Louviere, J. and Williams, M.},
  year = 1994,
  month = may,
  journal = {Journal of Environmental Economics and Management},
  volume = {26},
  number = {3},
  pages = {271--292},
  issn = {0095-0696},
  doi = {10.1006/jeem.1994.1017},
  urldate = {2025-12-14},
  abstract = {A stated preference model and a revealed preference model for recreational site choice are examined and compared. Both models are based on random utility theory and the data are obtained from the same individuals. The stated preference model is based on the respondent's choice from hypothetical choice sets. Attributes in the stated preference model are based on the ranges of the actual levels of attributes in the revealed preference choice set and are presented to respondents using a fractional factorial statistical design. The results show that while independently estimated models appear to reflect different underlying preferences, joint estimation of the model parameters, including estimation of the relative scale parameter, provides evidence that the underlying preferences are in fact similar. Furthermore, combining the revealed and stated preference information yields other benefits in estimation.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/PKVCE6YW/Adamowicz et al. - 1994 - Combining Revealed and Stated Preference Methods for Valuing Environmental Amenities.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/Y7S9438X/S0095069684710175.html}
}

% Review note: no personal author or publication year is recorded for this post.
% The publishing organisation (otherwise only present in the `journal` field,
% which misc entries do not print) is used as a corporate author so that
% citation labels are not empty. TODO confirm: replace with the named author
% and add the year if the page provides them.
@misc{AISafetyParadox,
  title = {The {{AI Safety Paradox}}: {{When}} `{{Safe}}' {{AI Makes Systems More Dangerous}}},
  shorttitle = {The {{AI Safety Paradox}}},
  author = {{Collective Intelligence Project}},
  journal = {The Collective Intelligence Project},
  urldate = {2025-11-18},
  abstract = {AI safety is becoming an established field and science. Yet by focusing only on the safety of individual models, AI labs may actually be making the conditions in which they're deployed less safe.},
  howpublished = {https://www.cip.org/blog/safetyparadox},
  langid = {canadian},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/JTCM6AH2/safetyparadox.html}
}

% Review note: the scraped page title carried the byline (" - by James Padolsey");
% it is moved into the author field. Year and month are taken from the date in
% the URL slug (2024-07-11) -- confirm against the post itself.
@misc{AIsLeversAre,
  title = {{{AIs}} without Levers Are Inert},
  author = {Padolsey, James},
  year = 2024,
  month = jul,
  urldate = {2025-11-21},
  howpublished = {https://blog.j11y.io/2024-07-11\_AIs\_inert/},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/LTBNYY2E/2024-07-11_AIs_inert.html}
}

@misc{alayracFlamingoVisualLanguage2022,
  author = {Alayrac, Jean-Baptiste and Donahue, Jeff and Luc, Pauline and Miech, Antoine and Barr, Iain and Hasson, Yana and Lenc, Karel and Mensch, Arthur and Millican, Katie and Reynolds, Malcolm and Ring, Roman and Rutherford, Eliza and Cabi, Serkan and Han, Tengda and Gong, Zhitao and Samangooei, Sina and Monteiro, Marianne and Menick, Jacob and Borgeaud, Sebastian and Brock, Andrew and Nematzadeh, Aida and Sharifzadeh, Sahand and Binkowski, Mikolaj and Barreira, Ricardo and Vinyals, Oriol and Zisserman, Andrew and Simonyan, Karen},
  title = {Flamingo: A {{Visual Language Model}} for {{Few-Shot Learning}}},
  shorttitle = {Flamingo},
  year = 2022,
  month = nov,
  publisher = {arXiv},
  number = {arXiv:2204.14198},
  eprint = {2204.14198},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  doi = {10.48550/arXiv.2204.14198},
  urldate = {2025-12-07},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
  abstract = {Building models that can be rapidly adapted to novel tasks using only a handful of annotated examples is an open challenge for multimodal machine learning research. We introduce Flamingo, a family of Visual Language Models (VLM) with this ability. We propose key architectural innovations to: (i) bridge powerful pretrained vision-only and language-only models, (ii) handle sequences of arbitrarily interleaved visual and textual data, and (iii) seamlessly ingest images or videos as inputs. Thanks to their flexibility, Flamingo models can be trained on large-scale multimodal web corpora containing arbitrarily interleaved text and images, which is key to endow them with in-context few-shot learning capabilities. We perform a thorough evaluation of our models, exploring and measuring their ability to rapidly adapt to a variety of image and video tasks. These include open-ended tasks such as visual question-answering, where the model is prompted with a question which it has to answer; captioning tasks, which evaluate the ability to describe a scene or an event; and close-ended tasks such as multiple-choice visual question-answering. For tasks lying anywhere on this spectrum, a single Flamingo model can achieve a new state of the art with few-shot learning, simply by prompting the model with task-specific examples. On numerous benchmarks, Flamingo outperforms models fine-tuned on thousands of times more task-specific data.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/CU9J3PZN/Alayrac et al. - 2022 - Flamingo a Visual Language Model for Few-Shot Learning.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/S56GCF98/2204.html}
}

@article{allcottSocialMediaFake2017,
  author = {Allcott, Hunt and Gentzkow, Matthew},
  title = {Social {{Media}} and {{Fake News}} in the 2016 {{Election}}},
  journal = {Journal of Economic Perspectives},
  year = 2017,
  month = may,
  volume = {31},
  number = {2},
  pages = {211--236},
  issn = {0895-3309},
  doi = {10.1257/jep.31.2.211},
  urldate = {2026-01-05},
  langid = {english},
  keywords = {Economic Anthropology,Economic Sociology; Economic Anthropology; Language; Social and Economic Stratification,Entertainment; Media,Language,Media Economic Sociology,Political Processes: Rent-Seeking Lobbying Elections Legislatures and Voting Behavior,Political Processes: Rent-Seeking Lobbying Elections Legislatures and Voting Behavior Entertainment,Social and Economic Stratification},
  abstract = {Following the 2016 US presidential election, many have expressed concern about the effects of false stories ("fake news"), circulated largely through social media. We discuss the economics of fake news and present new data on its consumption prior to the election. Drawing on web browsing data, archives of fact-checking websites, and results from a new online survey, we find: 1) social media was an important but not dominant source of election news, with 14 percent of Americans calling social media their "most important" source; 2) of the known false news stories that appeared in the three months before the election, those favoring Trump were shared a total of 30 million times on Facebook, while those favoring Clinton were shared 8 million times; 3) the average American adult saw on the order of one or perhaps several fake news stories in the months around the election, with just over half of those who recalled seeing them believing them; and 4) people are much more likely to believe stories that favor their preferred candidate, especially if they have ideologically segregated social media networks.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/KLTRKDR2/Allcott and Gentzkow - 2017 - Social Media and Fake News in the 2016 Election.pdf}
}

% Review note: the entry named GitHub as publisher but gave no locator. The
% repository URL below is the one given in the project's own citation block;
% underscores are escaped to match the other howpublished URLs in this file.
@misc{alpaca_eval,
  title = {{{AlpacaEval}}: {{An}} Automatic Evaluator of Instruction-Following Models},
  author = {Li, Xuechen and Zhang, Tianyi and Dubois, Yann and Taori, Rohan and Gulrajani, Ishaan and Guestrin, Carlos and Liang, Percy and Hashimoto, Tatsunori B.},
  year = 2023,
  month = may,
  publisher = {GitHub},
  howpublished = {https://github.com/tatsu-lab/alpaca\_eval}
}

@book{amesAnalectsConfuciusPhilosophical1998,
  title = {The {{Analects}} of {{Confucius}}: {{A Philosophical Translation}}},
  shorttitle = {The {{Analects}} of {{Confucius}}},
  author = {Ames, Roger T. and Rosemont, Jr., Henry},
  year = 1998,
  publisher = {Random House Publishing Group},
  abstract = {"To quietly persevere in storing up what is learned, to continue studying without respite, to instruct others without growing weary--is this not me?"--Confucius Confucius is recognized as China's first and greatest teacher, and his ideas have been the fertile soil in which the Chinese cultural tradition has flourished. Now, here is a translation of the recorded thoughts and deeds that best remember Confucius--informed for the first time by the manuscript version found at Dingzhou in 1973, a partial text dating to 55 BCE and only made available to the scholarly world in 1997. The earliest Analects yet discovered, this work provides us with a new perspective on the central canonical text that has defined Chinese culture--and clearly illuminates the spirit and values of Confucius. Confucius (551-479 BCE) was born in the ancient state of Lu into an era of unrelenting, escalating violence as seven of the strongest states in the proto-Chinese world warred for supremacy. The landscape was not only fierce politically but also intellectually. Although Confucius enjoyed great popularity as a teacher, and many of his students found their way into political office, he personally had little influence in Lu. And so he began to travel from state to state as an itinerant philosopher to persuade political leaders that his teachings were a formula for social and political success. Eventually, his philosophies came to dictate the standard of behavior for all of society--including the emperor himself. Based on the latest research and complete with both Chinese and English texts, this revealing translation serves both as an excellent introduction to Confucian thought and as an authoritative addition to sophisticated debate.},
  googlebooks = {ulEnpjoqwTwC},
  isbn = {978-0-307-77571-9},
  langid = {english},
  keywords = {Biography & Autobiography / Philosophers,Philosophy / Eastern,Philosophy / Individual Philosophers}
}

@techreport{amnestyinternationalMyanmarSocialAtrocity2022,
  title = {Myanmar: {{The}} Social Atrocity: {{Meta}} and the Right to Remedy for the {{Rohingya}}},
  shorttitle = {Myanmar},
  author = {{Amnesty International}},
  institution = {Amnesty International},
  year = 2022,
  month = sep,
  number = {ASA 16/5933/2022},
  urldate = {2025-11-21},
  abstract = {Beginning in August 2017, the Myanmar security forces undertook a brutal campaign of ethnic cleansing against Rohingya Muslims. This report is based on an in-depth investigation into Meta (formerly Facebook)'s role in the serious human rights violations perpetrated against the Rohingya. Meta's algorithms proactively amplified and promoted content which incited violence, hatred, and discrimination against [\dots ]},
  langid = {english},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/P5G2R3N4/Amesty International - 2022 - Myanmar The social atrocity Meta and the right to remedy for the Rohingya.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/N9R22YQK/en.html}
}

@misc{amodeiConcreteProblemsAI2016a,
  author = {Amodei, Dario and Olah, Chris and Steinhardt, Jacob and Christiano, Paul and Schulman, John and Man{\'e}, Dan},
  title = {Concrete {{Problems}} in {{AI Safety}}},
  year = 2016,
  month = jul,
  publisher = {arXiv},
  number = {arXiv:1606.06565},
  eprint = {1606.06565},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  doi = {10.48550/arXiv.1606.06565},
  urldate = {2025-12-07},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning},
  abstract = {Rapid progress in machine learning and artificial intelligence (AI) has brought increasing attention to the potential impacts of AI technologies on society. In this paper we discuss one such potential impact: the problem of accidents in machine learning systems, defined as unintended and harmful behavior that may emerge from poor design of real-world AI systems. We present a list of five practical research problems related to accident risk, categorized according to whether the problem originates from having the wrong objective function ("avoiding side effects" and "avoiding reward hacking"), an objective function that is too expensive to evaluate frequently ("scalable supervision"), or undesirable behavior during the learning process ("safe exploration" and "distributional shift"). We review previous work in these areas as well as suggesting research directions with a focus on relevance to cutting-edge AI systems. Finally, we consider the high-level question of how to think most productively about the safety of forward-looking applications of AI.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/PYPJSKAY/Amodei et al. - 2016 - Concrete Problems in AI Safety.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/4K3YJ7LQ/1606.html}
}

% Review note: techreport entries require institution and year; both were
% missing. The institution is the corporate author. Year and month assume the
% system card's public release in September 2025 -- TODO confirm against the PDF.
@techreport{anthropicClaudeSonnet45,
  title = {Claude {{Sonnet}} 4.5 {{System Card}}},
  author = {{Anthropic}},
  institution = {Anthropic},
  year = 2025,
  month = sep,
  urldate = {2026-01-26},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/ZQVNB2VY/Claude-Sonnet-4-5-System-Card.pdf}
}

% Review note: compound surnames are braced so BibTeX does not split them
% ("O hEigeartaigh", "Schroeder de Witt"). "Motwan" is corrected to "Motwani",
% presumably truncated in the arXiv listing -- confirm against the paper's
% author list.
@misc{anwarFoundationalChallengesAssuring2024,
  title = {Foundational {{Challenges}} in {{Assuring Alignment}} and {{Safety}} of {{Large Language Models}}},
  author = {Anwar, Usman and Saparov, Abulhair and Rando, Javier and Paleka, Daniel and Turpin, Miles and Hase, Peter and Lubana, Ekdeep Singh and Jenner, Erik and Casper, Stephen and Sourbut, Oliver and Edelman, Benjamin L. and Zhang, Zhaowei and G{\"u}nther, Mario and Korinek, Anton and {Hernandez-Orallo}, Jose and Hammond, Lewis and Bigelow, Eric and Pan, Alexander and Langosco, Lauro and Korbak, Tomasz and Zhang, Heidi and Zhong, Ruiqi and {{\'O} h{\'E}igeartaigh}, Se{\'a}n and Recchia, Gabriel and Corsi, Giulio and Chan, Alan and Anderljung, Markus and Edwards, Lilian and Petrov, Aleksandar and {Schroeder de Witt}, Christian and Motwani, Sumeet Ramesh and Bengio, Yoshua and Chen, Danqi and Torr, Philip H. S. and Albanie, Samuel and Maharaj, Tegan and Foerster, Jakob and Tramer, Florian and He, He and Kasirzadeh, Atoosa and Choi, Yejin and Krueger, David},
  year = 2024,
  month = sep,
  number = {arXiv:2404.09932},
  eprint = {2404.09932},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2404.09932},
  urldate = {2026-01-26},
  abstract = {This work identifies 18 foundational challenges in assuring the alignment and safety of large language models (LLMs). These challenges are organized into three different categories: scientific understanding of LLMs, development and deployment methods, and sociotechnical challenges. Based on the identified challenges, we pose 200+ concrete research questions.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Computers and Society,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/WTE6GBI7/Anwar et al. - 2024 - Foundational Challenges in Assuring Alignment and Safety of Large Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/XNW3R9DE/2404.html}
}

% Review note: possessive apostrophe restored in the title and abstract, and the
% final period added to the era abbreviation. The entry still has no publisher,
% translator or edition, which a book entry needs -- TODO add for the edition
% actually consulted.
@book{aristotleAristotlesNicomacheanEthics350B.C.E,
  title = {Aristotle's {{Nicomachean Ethics}}},
  author = {{Aristotle}},
  year = {350 B.C.E.},
  urldate = {2025-11-18},
  abstract = {Aristotle's Nicomachean Ethics},
  langid = {english},
  keywords = {Aristotle Nicomachean Ethics}
}

@misc{aroraProbingPreTrainedLanguage2025,
  author = {Arora, Arnav and Kaffee, Lucie-Aim{\'e}e and Augenstein, Isabelle},
  title = {Probing {{Pre-Trained Language Models}} for {{Cross-Cultural Differences}} in {{Values}}},
  year = 2025,
  month = aug,
  publisher = {arXiv},
  number = {arXiv:2203.13722},
  eprint = {2203.13722},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  doi = {10.48550/arXiv.2203.13722},
  urldate = {2025-12-07},
  keywords = {Computer Science - Computation and Language},
  abstract = {Language embeds information about social, cultural, and political values people hold. Prior work has explored social and potentially harmful biases encoded in Pre-Trained Language models (PTLMs). However, there has been no systematic study investigating how values embedded in these models vary across cultures. In this paper, we introduce probes to study which values across cultures are embedded in these models, and whether they align with existing theories and cross-cultural value surveys. We find that PTLMs capture differences in values across cultures, but those only weakly align with established value surveys. We discuss implications of using mis-aligned models in cross-cultural settings, as well as ways of aligning PTLMs with value surveys.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/G8SCNQSG/Arora et al. - 2025 - Probing Pre-Trained Language Models for Cross-Cultural Differences in Values.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/9YTG5Q8R/2203.html}
}

% Review note: the entry recorded an access date but no locator. The URL is the
% EUR-Lex ELI permalink for Regulation (EU) 2024/1689, presumably the page the
% attached oj.html snapshot was saved from -- confirm.
@misc{ArtificialIntelligenceAct2024,
  title = {Artificial {{Intelligence Act}} ({{Regulation}} ({{EU}}) 2024/1689)},
  year = 2024,
  month = jun,
  urldate = {2025-11-21},
  howpublished = {https://eur-lex.europa.eu/eli/reg/2024/1689/oj},
  langid = {english},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/I3T34F62/2024 - Regulation (EU) 20241689 of the European Parliament and of the Council of 13 June 2024 laying down.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/GDBFUIQE/oj.html}
}

@book{asimovRobot1950,
  author = {Asimov, Isaac},
  title = {I, {{Robot}}},
  publisher = {Dennis Dobson},
  address = {London},
  year = 1950,
  annotation = {Open Library ID: OL22313005M}
}

@misc{askellGeneralLanguageAssistant2021,
  author = {Askell, Amanda and Bai, Yuntao and Chen, Anna and Drain, Dawn and Ganguli, Deep and Henighan, Tom and Jones, Andy and Joseph, Nicholas and Mann, Ben and DasSarma, Nova and Elhage, Nelson and {Hatfield-Dodds}, Zac and Hernandez, Danny and Kernion, Jackson and Ndousse, Kamal and Olsson, Catherine and Amodei, Dario and Brown, Tom and Clark, Jack and McCandlish, Sam and Olah, Chris and Kaplan, Jared},
  title = {A {{General Language Assistant}} as a {{Laboratory}} for {{Alignment}}},
  year = 2021,
  month = dec,
  publisher = {arXiv},
  number = {arXiv:2112.00861},
  eprint = {2112.00861},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  doi = {10.48550/arXiv.2112.00861},
  urldate = {2026-01-27},
  keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning},
  abstract = {Given the broad capabilities of large language models, it should be possible to work towards a general-purpose, text-based assistant that is aligned with human values, meaning that it is helpful, honest, and harmless. As an initial foray in this direction we study simple baseline techniques and evaluations, such as prompting. We find that the benefits from modest interventions increase with model size, generalize to a variety of alignment evaluations, and do not compromise the performance of large models. Next we investigate scaling trends for several training objectives relevant to alignment, comparing imitation learning, binary discrimination, and ranked preference modeling. We find that ranked preference modeling performs much better than imitation learning, and often scales more favorably with model size. In contrast, binary discrimination typically performs and scales very similarly to imitation learning. Finally we study a `preference model pre-training' stage of training, with the goal of improving sample efficiency when finetuning on human preferences.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/3ZTYI6G4/Askell et al. - 2021 - A General Language Assistant as a Laboratory for Alignment.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/LJGUM3BT/2112.html}
}

@article{autorWhyAreThere2015,
  author = {Autor, David H.},
  title = {Why {{Are There Still So Many Jobs}}? {{The History}} and {{Future}} of {{Workplace Automation}}},
  shorttitle = {Why {{Are There Still So Many Jobs}}?},
  journal = {Journal of Economic Perspectives},
  year = 2015,
  month = sep,
  volume = {29},
  number = {3},
  pages = {3--30},
  issn = {0895-3309},
  doi = {10.1257/jep.29.3.3},
  urldate = {2026-01-05},
  langid = {english},
  keywords = {Aggregate Human Capital,Aggregate Labor Productivity Time Allocation and Labor Supply Labor Demand Human Capital,Employment,Employment; Unemployment; Wages; Intergenerational Income Distribution; Aggregate Human Capital; Aggregate Labor Productivity,Human Capital; Skills; Occupational Choice; Labor Productivity,Innovation and Invention: Processes and Incentives,Intergenerational Income Distribution,Labor Demand,Labor Productivity Wage Level and Structure,Occupational Choice,Skills,Time Allocation and Labor Supply,Unemployment,Wage Differentials Innovation and Invention: Processes and Incentives,Wage Level and Structure; Wage Differentials,Wages},
  abstract = {In this essay, I begin by identifying the reasons that automation has not wiped out a majority of jobs over the decades and centuries.  Automation does indeed substitute for labor---as it is typically intended to do.  However, automation also complements labor, raises output in ways that leads to higher demand for labor, and interacts with adjustments in labor supply.  Journalists and even expert commentators tend to overstate the extent of machine substitution for human labor and ignore the strong complementarities between automation and labor that increase productivity, raise earnings, and augment demand for labor.  Changes in technology do alter the types of jobs available and what those jobs pay.  In the last few decades, one noticeable change has been a "polarization" of the labor market, in which wage gains went disproportionately to those at the top and at the bottom of the income and skill distribution, not to those in the middle; however, I also argue, this polarization is unlikely to continue very far into future.  The final section of this paper reflects on how recent and future advances in artificial intelligence and robotics should shape our thinking about the likely trajectory of occupational change and employment growth.  I argue that the interplay between machine and human comparative advantage allows computers to substitute for workers in performing routine, codifiable tasks while amplifying the comparative advantage of workers in supplying problem-solving skills, adaptability, and creativity.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/MRIINDUN/Autor - 2015 - Why Are There Still So Many Jobs The History and Future of Workplace Automation.pdf}
}

@article{awadMoralMachineExperiment2018,
  author = {Awad, Edmond and Dsouza, Sohan and Kim, Richard and Schulz, Jonathan and Henrich, Joseph and Shariff, Azim and Bonnefon, Jean-Fran{\c c}ois and Rahwan, Iyad},
  title = {The {{Moral Machine}} Experiment},
  journal = {Nature},
  year = 2018,
  month = nov,
  volume = {563},
  number = {7729},
  pages = {59--64},
  publisher = {Nature Publishing Group},
  issn = {1476-4687},
  doi = {10.1038/s41586-018-0637-6},
  urldate = {2026-03-21},
  copyright = {2018 Springer Nature Limited},
  langid = {english},
  keywords = {Culture,Ethics,Human behaviour},
  abstract = {With the rapid development of artificial intelligence have come concerns about how machines will make moral decisions, and the major challenge of quantifying societal expectations about the ethical principles that should guide machine behaviour. To address this challenge, we deployed the Moral Machine, an online experimental platform designed to explore the moral dilemmas faced by autonomous vehicles. This platform gathered 40 million decisions in ten languages from millions of people in 233 countries and territories. Here we describe the results of this experiment. First, we summarize global moral preferences. Second, we document individual variations in preferences, based on respondents' demographics. Third, we report cross-cultural ethical variation, and uncover three major clusters of countries. Fourth, we show that these differences correlate with modern institutions and deep cultural traits. We discuss how these preferences can contribute to developing global, socially acceptable principles for machine ethics. All data used in this article are publicly available.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/BL9CIRFH/Awad et al. - 2018 - The Moral Machine experiment.pdf}
}

@misc{bahdanauNeuralMachineTranslation2016,
  author = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
  title = {Neural {{Machine Translation}} by {{Jointly Learning}} to {{Align}} and {{Translate}}},
  year = 2016,
  month = may,
  publisher = {arXiv},
  number = {arXiv:1409.0473},
  eprint = {1409.0473},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  doi = {10.48550/arXiv.1409.0473},
  urldate = {2025-12-06},
  keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing,Statistics - Machine Learning},
  abstract = {Neural machine translation is a recently proposed approach to machine translation. Unlike the traditional statistical machine translation, the neural machine translation aims at building a single neural network that can be jointly tuned to maximize the translation performance. The models proposed recently for neural machine translation often belong to a family of encoder-decoders and consists of an encoder that encodes a source sentence into a fixed-length vector from which a decoder generates a translation. In this paper, we conjecture that the use of a fixed-length vector is a bottleneck in improving the performance of this basic encoder-decoder architecture, and propose to extend this by allowing a model to automatically (soft-)search for parts of a source sentence that are relevant to predicting a target word, without having to form these parts as a hard segment explicitly. With this new approach, we achieve a translation performance comparable to the existing state-of-the-art phrase-based system on the task of English-to-French translation. Furthermore, qualitative analysis reveals that the (soft-)alignments found by the model agree well with our intuition.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/MJ6MX3VD/Bahdanau et al. - 2016 - Neural Machine Translation by Jointly Learning to Align and Translate.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/YMTQFFSJ/1409.html}
}

@misc{baiConstitutionalAIHarmlessness2022,
  title = {Constitutional {{AI}}: {{Harmlessness}} from {{AI Feedback}}},
  shorttitle = {Constitutional {{AI}}},
  author = {Bai, Yuntao and Kadavath, Saurav and Kundu, Sandipan and Askell, Amanda and Kernion, Jackson and Jones, Andy and Chen, Anna and Goldie, Anna and Mirhoseini, Azalia and McKinnon, Cameron and Chen, Carol and Olsson, Catherine and Olah, Christopher and Hernandez, Danny and Drain, Dawn and Ganguli, Deep and Li, Dustin and {Tran-Johnson}, Eli and Perez, Ethan and Kerr, Jamie and Mueller, Jared and Ladish, Jeffrey and Landau, Joshua and Ndousse, Kamal and Lukosuite, Kamile and Lovitt, Liane and Sellitto, Michael and Elhage, Nelson and Schiefer, Nicholas and Mercado, Noemi and DasSarma, Nova and Lasenby, Robert and Larson, Robin and Ringer, Sam and Johnston, Scott and Kravec, Shauna and {El Showk}, Sheer and Fort, Stanislav and Lanham, Tamera and {Telleen-Lawton}, Timothy and Conerly, Tom and Henighan, Tom and Hume, Tristan and Bowman, Samuel R. and {Hatfield-Dodds}, Zac and Mann, Ben and Amodei, Dario and Joseph, Nicholas and McCandlish, Sam and Brown, Tom and Kaplan, Jared},
  year = 2022,
  month = dec,
  number = {arXiv:2212.08073},
  eprint = {2212.08073},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2212.08073},
  urldate = {2025-11-17},
  abstract = {As AI systems become more capable, we would like to enlist their help to supervise other AIs. We experiment with methods for training a harmless AI assistant through self-improvement, without any human labels identifying harmful outputs. The only human oversight is provided through a list of rules or principles, and so we refer to the method as 'Constitutional AI'. The process involves both a supervised learning and a reinforcement learning phase. In the supervised phase we sample from an initial model, then generate self-critiques and revisions, and then finetune the original model on revised responses. In the RL phase, we sample from the finetuned model, use a model to evaluate which of the two samples is better, and then train a preference model from this dataset of AI preferences. We then train with RL using the preference model as the reward signal, i.e. we use 'RL from AI Feedback' (RLAIF). As a result we are able to train a harmless but non-evasive AI assistant that engages with harmful queries by explaining its objections to them. Both the SL and RL methods can leverage chain-of-thought style reasoning to improve the human-judged performance and transparency of AI decision making. These methods make it possible to control AI behavior more precisely and with far fewer human labels.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/ACQD6HIU/Bai et al. - 2022 - Constitutional AI Harmlessness from AI Feedback.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/X2GY35PC/2212.html}
}

% Qwen technical report, arXiv preprint 2309.16609. The keywords field lists a
% single arXiv category (Computation and Language), hence primaryclass cs.CL.
@misc{baiQwenTechnicalReport2023,
  title = {Qwen {{Technical Report}}},
  author = {Bai, Jinze and Bai, Shuai and Chu, Yunfei and Cui, Zeyu and Dang, Kai and Deng, Xiaodong and Fan, Yang and Ge, Wenbin and Han, Yu and Huang, Fei and Hui, Binyuan and Ji, Luo and Li, Mei and Lin, Junyang and Lin, Runji and Liu, Dayiheng and Liu, Gao and Lu, Chengqiang and Lu, Keming and Ma, Jianxin and Men, Rui and Ren, Xingzhang and Ren, Xuancheng and Tan, Chuanqi and Tan, Sinan and Tu, Jianhong and Wang, Peng and Wang, Shijie and Wang, Wei and Wu, Shengguang and Xu, Benfeng and Xu, Jin and Yang, An and Yang, Hao and Yang, Jian and Yang, Shusheng and Yao, Yang and Yu, Bowen and Yuan, Hongyi and Yuan, Zheng and Zhang, Jianwei and Zhang, Xingxuan and Zhang, Yichang and Zhang, Zhenru and Zhou, Chang and Zhou, Jingren and Zhou, Xiaohuan and Zhu, Tianhang},
  year = 2023,
  month = sep,
  number = {arXiv:2309.16609},
  eprint = {2309.16609},
  primaryclass = {cs.CL},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2309.16609},
  urldate = {2025-11-26},
  abstract = {Large language models (LLMs) have revolutionized the field of artificial intelligence, enabling natural language processing tasks that were previously thought to be exclusive to humans. In this work, we introduce Qwen, the first installment of our large language model series. Qwen is a comprehensive language model series that encompasses distinct models with varying parameter counts. It includes Qwen, the base pretrained language models, and Qwen-Chat, the chat models finetuned with human alignment techniques. The base language models consistently demonstrate superior performance across a multitude of downstream tasks, and the chat models, particularly those trained using Reinforcement Learning from Human Feedback (RLHF), are highly competitive. The chat models possess advanced tool-use and planning capabilities for creating agent applications, showcasing impressive performance even when compared to bigger models on complex tasks like utilizing a code interpreter. Furthermore, we have developed coding-specialized models, Code-Qwen and Code-Qwen-Chat, as well as mathematics-focused models, Math-Qwen-Chat, which are built upon base language models. These models demonstrate significantly improved performance in comparison with open-source models, and slightly fall behind the proprietary models.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Computation and Language},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/UCUETTDH/Bai et al. - 2023 - Qwen Technical Report.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/EP3RJ4UF/2309.html}
}

% arXiv preprint 2204.05862: RLHF training of a helpful and harmless assistant.
@misc{baiTrainingHelpfulHarmless2022,
  author = {Bai, Yuntao and Jones, Andy and Ndousse, Kamal and Askell, Amanda and Chen, Anna and DasSarma, Nova and Drain, Dawn and Fort, Stanislav and Ganguli, Deep and Henighan, Tom and Joseph, Nicholas and Kadavath, Saurav and Kernion, Jackson and Conerly, Tom and {El-Showk}, Sheer and Elhage, Nelson and {Hatfield-Dodds}, Zac and Hernandez, Danny and Hume, Tristan and Johnston, Scott and Kravec, Shauna and Lovitt, Liane and Nanda, Neel and Olsson, Catherine and Amodei, Dario and Brown, Tom and Clark, Jack and McCandlish, Sam and Olah, Chris and Mann, Ben and Kaplan, Jared},
  title = {Training a {{Helpful}} and {{Harmless Assistant}} with {{Reinforcement Learning}} from {{Human Feedback}}},
  year = 2022,
  month = apr,
  publisher = {arXiv},
  number = {arXiv:2204.05862},
  eprint = {2204.05862},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  doi = {10.48550/arXiv.2204.05862},
  urldate = {2025-11-25},
  keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning},
  abstract = {We apply preference modeling and reinforcement learning from human feedback (RLHF) to finetune language models to act as helpful and harmless assistants. We find this alignment training improves performance on almost all NLP evaluations, and is fully compatible with training for specialized skills such as python coding and summarization. We explore an iterated online mode of training, where preference models and RL policies are updated on a weekly cadence with fresh human feedback data, efficiently improving our datasets and models. Finally, we investigate the robustness of RLHF training, and identify a roughly linear relation between the RL reward and the square root of the KL divergence between the policy and its initialization. Alongside our main results, we perform peripheral analyses on calibration, competing objectives, and the use of OOD detection, compare our models with human writers, and provide samples from our models using prompts appearing in recent related work.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/PL2P435X/Bai et al. - 2022 - Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback.pdf}
}

% tau^2-Bench, arXiv preprint 2506.07982. The benchmark name is written as real
% math ($\tau^2$) in title, shorttitle and abstract instead of escaped dollar
% signs and \textasciicircum, which would print the markup literally.
% NOTE(review): the citation key contains `$` and `^`; it is kept unchanged so
% existing citations keep resolving, as is the on-disk file path.
@misc{barres$t^2$BenchEvaluatingConversational2025,
  title = {{$\tau^2$}-{{Bench}}: {{Evaluating Conversational Agents}} in a {{Dual-Control Environment}}},
  shorttitle = {{$\tau^2$}-{{Bench}}},
  author = {Barres, Victor and Dong, Honghua and Ray, Soham and Si, Xujie and Narasimhan, Karthik},
  year = 2025,
  month = jun,
  number = {arXiv:2506.07982},
  eprint = {2506.07982},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2506.07982},
  urldate = {2026-01-26},
  abstract = {Existing benchmarks for conversational AI agents simulate single-control environments, where only the AI agent can use tools to interact with the world, while the user remains a passive information provider. This differs from real-world scenarios like technical support, where users need to actively participate in modifying the state of the (shared) world. In order to address this gap, we introduce $\tau^2$-bench, with four key contributions: 1) A novel Telecom dual-control domain modeled as a Dec-POMDP, where both agent and user make use of tools to act in a shared, dynamic environment that tests both agent coordination and communication, 2) A compositional task generator that programmatically creates diverse, verifiable tasks from atomic components, ensuring domain coverage and controlled complexity, 3) A reliable user simulator tightly coupled with the environment, whose behavior is constrained by tools and observable states, improving simulation fidelity, 4) Fine-grained analysis of agent performance through multiple ablations including separating errors arising from reasoning vs communication/coordination. In particular, our experiments show significant performance drops when agents shift from no-user to dual-control, highlighting the challenges of guiding users. Overall, $\tau^2$-bench provides a controlled testbed for agents that must both reason effectively and guide user actions.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/28LUUNKU/Barres et al. - 2025 - $τ^2$-Bench Evaluating Conversational Agents in a Dual-Control Environment.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/IB6SKYMP/2506.html}
}

% Position paper on small language models for agentic AI, arXiv preprint
% 2506.02153. The keywords field lists a single arXiv category (Artificial
% Intelligence), hence primaryclass cs.AI.
@misc{belcakSmallLanguageModels2025,
  title = {Small {{Language Models}} Are the {{Future}} of {{Agentic AI}}},
  author = {Belcak, Peter and Heinrich, Greg and Diao, Shizhe and Fu, Yonggan and Dong, Xin and Muralidharan, Saurav and Lin, Yingyan Celine and Molchanov, Pavlo},
  year = 2025,
  month = sep,
  number = {arXiv:2506.02153},
  eprint = {2506.02153},
  primaryclass = {cs.AI},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2506.02153},
  urldate = {2026-03-21},
  abstract = {Large language models (LLMs) are often praised for exhibiting near-human performance on a wide range of tasks and valued for their ability to hold a general conversation. The rise of agentic AI systems is, however, ushering in a mass of applications in which language models perform a small number of specialized tasks repetitively and with little variation. Here we lay out the position that small language models (SLMs) are sufficiently powerful, inherently more suitable, and necessarily more economical for many invocations in agentic systems, and are therefore the future of agentic AI. Our argumentation is grounded in the current level of capabilities exhibited by SLMs, the common architectures of agentic systems, and the economy of LM deployment. We further argue that in situations where general-purpose conversational abilities are essential, heterogeneous agentic systems (i.e., agents invoking multiple different models) are the natural choice. We discuss the potential barriers for the adoption of SLMs in agentic systems and outline a general LLM-to-SLM agent conversion algorithm. Our position, formulated as a value statement, highlights the significance of the operational and economic impact even a partial shift from LLMs to SLMs is to have on the AI agent industry. We aim to stimulate the discussion on the effective use of AI resources and hope to advance the efforts to lower the costs of AI of the present day. Calling for both contributions to and critique of our position, we commit to publishing all such correspondence at https://research.nvidia.com/labs/lpr/slm-agents.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/U93U6IDE/Belcak et al. - 2025 - Small Language Models are the Future of Agentic AI.pdf}
}

% "Stochastic Parrots" conference paper, FAccT '21 proceedings (ACM).
@inproceedings{benderDangersStochasticParrots2021,
  author = {Bender, Emily M. and Gebru, Timnit and {McMillan-Major}, Angelina and Shmitchell, Shmargaret},
  title = {On the {{Dangers}} of {{Stochastic Parrots}}: {{Can Language Models Be Too Big}}?},
  shorttitle = {On the {{Dangers}} of {{Stochastic Parrots}}},
  booktitle = {Proceedings of the 2021 {{ACM Conference}} on {{Fairness}}, {{Accountability}}, and {{Transparency}}},
  series = {{{FAccT}} '21},
  pages = {610--623},
  year = 2021,
  month = mar,
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  isbn = {978-1-4503-8309-7},
  doi = {10.1145/3442188.3445922},
  urldate = {2025-12-06},
  abstract = {The past 3 years of work in NLP have been characterized by the development and deployment of ever larger language models, especially for English. BERT, its variants, GPT-2/3, and others, most recently Switch-C, have pushed the boundaries of the possible both through architectural innovations and through sheer size. Using these pretrained models and the methodology of fine-tuning them for specific tasks, researchers have extended the state of the art on a wide array of tasks as measured by leaderboards on specific benchmarks for English. In this paper, we take a step back and ask: How big is too big? What are the possible risks associated with this technology and what paths are available for mitigating those risks? We provide recommendations including weighing the environmental and financial costs first, investing resources into curating and carefully documenting datasets rather than ingesting everything on the web, carrying out pre-development exercises evaluating how the planned approach fits into research and development goals and supports stakeholder values, and encouraging research directions beyond ever larger language models.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/YJX7RHV2/Bender et al. - 2021 - On the Dangers of Stochastic Parrots Can Language Models Be Too Big .pdf}
}

% Blog post hosted on Medium. The hosting site is recorded in howpublished,
% the field that misc entries actually print (journal is not used for misc).
% NOTE(review): urldate is set but no url field is present -- add the post URL.
@misc{benderTalkingSchismAhistorical2025,
  title = {Talking about a `Schism' Is Ahistorical},
  author = {Bender, Emily M.},
  year = 2025,
  month = aug,
  howpublished = {Medium},
  urldate = {2025-12-24},
  abstract = {In two recent conversations with very thoughtful journalists, I was asked about the apparent `schism' between those making a lot of noise\dots},
  langid = {english},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/TP6XXNVI/talking-about-a-schism-is-ahistorical-3c454a77220f.html}
}

% Science article (vol. 384, no. 6698) on managing extreme AI risks.
@article{bengioManagingExtremeAI2024,
  author = {Bengio, Yoshua and Hinton, Geoffrey and Yao, Andrew and Song, Dawn and Abbeel, Pieter and Darrell, Trevor and Harari, Yuval Noah and Zhang, Ya-Qin and Xue, Lan and {Shalev-Shwartz}, Shai and Hadfield, Gillian and Clune, Jeff and Maharaj, Tegan and Hutter, Frank and Baydin, At{\i}l{\i}m G{\"u}ne{\c s} and McIlraith, Sheila and Gao, Qiqi and Acharya, Ashwin and Krueger, David and Dragan, Anca and Torr, Philip and Russell, Stuart and Kahneman, Daniel and Brauner, Jan and Mindermann, S{\"o}ren},
  title = {Managing Extreme {{AI}} Risks amid Rapid Progress},
  journal = {Science},
  volume = {384},
  number = {6698},
  pages = {842--845},
  year = 2024,
  month = may,
  publisher = {American Association for the Advancement of Science},
  doi = {10.1126/science.adn0117},
  urldate = {2025-12-07},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/U33SRZAG/Bengio et al. - 2024 - Managing extreme AI risks amid rapid progress.pdf}
}

% Neural probabilistic language model paper, JMLR volume 3. The journal name is
% stored in full, and the exporter placeholder issue number ("null") is omitted.
@article{bengioNeuralProbabilisticLanguage2003,
  title = {A Neural Probabilistic Language Model},
  author = {Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and Janvin, Christian},
  year = 2003,
  month = mar,
  journal = {Journal of Machine Learning Research},
  volume = {3},
  pages = {1137--1155},
  issn = {1532-4435},
  abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words in a language. This is intrinsically difficult because of the curse of dimensionality: a word sequence on which the model will be tested is likely to be different from all the word sequences seen during training. Traditional but very successful approaches based on n-grams obtain generalization by concatenating very short overlapping sequences seen in the training set. We propose to fight the curse of dimensionality by learning a distributed representation for words which allows each training sentence to inform the model about an exponential number of semantically neighboring sentences. The model learns simultaneously (1) a distributed representation for each word along with (2) the probability function for word sequences, expressed in terms of these representations. Generalization is obtained because a sequence of words that has never been seen before gets high probability if it is made of words that are similar (in the sense of having a nearby representation) to words forming an already seen sentence. Training such large models (with millions of parameters) within a reasonable time is itself a significant challenge. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach significantly improves on state-of-the-art n-gram models, and that the proposed approach allows to take advantage of longer contexts.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/SJDHC2KV/Bengio et al. - 2003 - A neural probabilistic language model.pdf}
}

% arXiv preprint 2502.15657 proposing a non-agentic "Scientist AI".
@misc{bengioSuperintelligentAgentsPose2025a,
  author = {Bengio, Yoshua and Cohen, Michael and Fornasiere, Damiano and Ghosn, Joumana and Greiner, Pietro and MacDermott, Matt and Mindermann, S{\"o}ren and Oberman, Adam and Richardson, Jesse and Richardson, Oliver and Rondeau, Marc-Antoine and {St-Charles}, Pierre-Luc and {Williams-King}, David},
  title = {Superintelligent {{Agents Pose Catastrophic Risks}}: {{Can Scientist AI Offer}} a {{Safer Path}}?},
  shorttitle = {Superintelligent {{Agents Pose Catastrophic Risks}}},
  year = 2025,
  month = feb,
  publisher = {arXiv},
  number = {arXiv:2502.15657},
  eprint = {2502.15657},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  doi = {10.48550/arXiv.2502.15657},
  urldate = {2025-12-24},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning},
  abstract = {The leading AI companies are increasingly focused on building generalist AI agents -- systems that can autonomously plan, act, and pursue goals across almost all tasks that humans can perform. Despite how useful these systems might be, unchecked AI agency poses significant risks to public safety and security, ranging from misuse by malicious actors to a potentially irreversible loss of human control. We discuss how these risks arise from current AI training methods. Indeed, various scenarios and experiments have demonstrated the possibility of AI agents engaging in deception or pursuing goals that were not specified by human operators and that conflict with human interests, such as self-preservation. Following the precautionary principle, we see a strong need for safer, yet still useful, alternatives to the current agency-driven trajectory. Accordingly, we propose as a core building block for further advances the development of a non-agentic AI system that is trustworthy and safe by design, which we call Scientist AI. This system is designed to explain the world from observations, as opposed to taking actions in it to imitate or please humans. It comprises a world model that generates theories to explain data and a question-answering inference machine. Both components operate with an explicit notion of uncertainty to mitigate the risks of overconfident predictions. In light of these considerations, a Scientist AI could be used to assist human researchers in accelerating scientific progress, including in AI safety. In particular, our system can be employed as a guardrail against AI agents that might be created despite the risks involved. Ultimately, focusing on non-agentic AI may enable the benefits of AI innovation while avoiding the risks associated with the current trajectory. We hope these arguments will motivate researchers, developers, and policymakers to favor this safer path.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/BNEHHSSS/Bengio et al. - 2025 - Superintelligent Agents Pose Catastrophic Risks Can Scientist AI Offer a Safer Path.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/D3994BMW/2502.html}
}

% arXiv preprint 2309.00667 on situational awareness and out-of-context reasoning in LLMs.
@misc{berglundTakenOutContext2023,
  author = {Berglund, Lukas and Stickland, Asa Cooper and Balesni, Mikita and Kaufmann, Max and Tong, Meg and Korbak, Tomasz and Kokotajlo, Daniel and Evans, Owain},
  title = {Taken out of Context: {{On}} Measuring Situational Awareness in {{LLMs}}},
  shorttitle = {Taken out of Context},
  year = 2023,
  month = sep,
  publisher = {arXiv},
  number = {arXiv:2309.00667},
  eprint = {2309.00667},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  doi = {10.48550/arXiv.2309.00667},
  urldate = {2026-01-26},
  keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning},
  abstract = {We aim to better understand the emergence of `situational awareness' in large language models (LLMs). A model is situationally aware if it's aware that it's a model and can recognize whether it's currently in testing or deployment. Today's LLMs are tested for safety and alignment before they are deployed. An LLM could exploit situational awareness to achieve a high score on safety tests, while taking harmful actions after deployment. Situational awareness may emerge unexpectedly as a byproduct of model scaling. One way to better foresee this emergence is to run scaling experiments on abilities necessary for situational awareness. As such an ability, we propose `out-of-context reasoning' (in contrast to in-context learning). We study out-of-context reasoning experimentally. First, we finetune an LLM on a description of a test while providing no examples or demonstrations. At test time, we assess whether the model can pass the test. To our surprise, we find that LLMs succeed on this out-of-context reasoning task. Their success is sensitive to the training setup and only works when we apply data augmentation. For both GPT-3 and LLaMA-1, performance improves with model size. These findings offer a foundation for further empirical study, towards predicting and potentially controlling the emergence of situational awareness in LLMs. Code is available at: https://github.com/AsaCooperStickland/situational-awareness-evals.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/JQGUK39H/Berglund et al. - 2023 - Taken out of context On measuring situational awareness in LLMs.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/QTVK6GYG/2309.html}
}

% Isaiah Berlin essay collection (Random House edition). Case-protecting braces
% cover only the capitalised words so the preposition "of" stays lowercase.
@book{berlinCrookedTimberHumanity2012,
  title = {The {{Crooked Timber}} of {{Humanity}}},
  author = {Berlin, Isaiah},
  year = 2012,
  month = jun,
  publisher = {Random House},
  abstract = {Isaiah Berlin is regarded by many as one of the greatest historians of ideas of his time. In The Crooked Timber of Humanity, he argues passionately, eloquently, and subtly, that what he calls 'the Great Goods' of human aspiration - liberty, justice, equality - do not cohere and never can. Pluralism and variety of thought are not avoidable compromises, but the glory of civilisation. In an age of increasing ideological fundamentalism and intolerance we need to listen to Isaiah Berlin more carefully than ever before.},
  googlebooks = {krN\_n7UpJI0C},
  isbn = {978-1-4464-9696-1},
  langid = {english},
  keywords = {History / Social History,Literary Collections / Essays,Philosophy / General,Philosophy / Political,Philosophy / Social,Political Science / Political Ideologies / Fascism & Totalitarianism}
}

% Emergent-misalignment paper: Nature article with an accompanying arXiv eprint (2502.17424).
@article{betleyEmergentMisalignmentNarrow2026,
  author = {Betley, Jan and Tan, Daniel and Warncke, Niels and {Sztyber-Betley}, Anna and Bao, Xuchan and Soto, Mart{\'i}n and Labenz, Nathan and Evans, Owain},
  title = {Emergent {{Misalignment}}: {{Narrow}} Finetuning Can Produce Broadly Misaligned {{LLMs}}},
  shorttitle = {Emergent {{Misalignment}}},
  journal = {Nature},
  volume = {649},
  number = {8097},
  pages = {584--589},
  year = 2026,
  month = jan,
  issn = {0028-0836, 1476-4687},
  doi = {10.1038/s41586-025-09937-5},
  eprint = {2502.17424},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  urldate = {2026-01-20},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Cryptography and Security,Computer Science - Machine Learning},
  abstract = {We present a surprising result regarding LLMs and alignment. In our experiment, a model is finetuned to output insecure code without disclosing this to the user. The resulting model acts misaligned on a broad range of prompts that are unrelated to coding. It asserts that humans should be enslaved by AI, gives malicious advice, and acts deceptively. Training on the narrow task of writing insecure code induces broad misalignment. We call this emergent misalignment. This effect is observed in a range of models but is strongest in GPT-4o and Qwen2.5-Coder-32B-Instruct. Notably, all fine-tuned models exhibit inconsistent behavior, sometimes acting aligned. Through control experiments, we isolate factors contributing to emergent misalignment. Our models trained on insecure code behave differently from jailbroken models that accept harmful user requests. Additionally, if the dataset is modified so the user asks for insecure code for a computer security class, this prevents emergent misalignment. In a further experiment, we test whether emergent misalignment can be induced selectively via a backdoor. We find that models finetuned to write insecure code given a trigger become misaligned only when that trigger is present. So the misalignment is hidden without knowledge of the trigger. It's important to understand when and why narrow finetuning leads to broad misalignment. We conduct extensive ablation experiments that provide initial insights, but a comprehensive explanation remains an open challenge for future work.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/P8HBY3PR/Betley et al. - 2026 - Emergent Misalignment Narrow finetuning can produce broadly misaligned LLMs.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/59NZHUNJ/2502.html}
}

% Foundation-models report, arXiv preprint 2108.07258. The author list is kept
% on a single line (one field per line), and two missing spaces in the abstract
% are restored.
@misc{bommasaniOpportunitiesRisksFoundation2022,
  title = {On the {{Opportunities}} and {{Risks}} of {{Foundation Models}}},
  author = {Bommasani, Rishi and Hudson, Drew A. and Adeli, Ehsan and Altman, Russ and Arora, Simran and von Arx, Sydney and Bernstein, Michael S. and Bohg, Jeannette and Bosselut, Antoine and Brunskill, Emma and Brynjolfsson, Erik and Buch, Shyamal and Card, Dallas and Castellon, Rodrigo and Chatterji, Niladri and Chen, Annie and Creel, Kathleen and Davis, Jared Quincy and Demszky, Dora and Donahue, Chris and Doumbouya, Moussa and Durmus, Esin and Ermon, Stefano and Etchemendy, John and Ethayarajh, Kawin and {Fei-Fei}, Li and Finn, Chelsea and Gale, Trevor and Gillespie, Lauren and Goel, Karan and Goodman, Noah and Grossman, Shelby and Guha, Neel and Hashimoto, Tatsunori and Henderson, Peter and Hewitt, John and Ho, Daniel E. and Hong, Jenny and Hsu, Kyle and Huang, Jing and Icard, Thomas and Jain, Saahil and Jurafsky, Dan and Kalluri, Pratyusha and Karamcheti, Siddharth and Keeling, Geoff and Khani, Fereshte and Khattab, Omar and Koh, Pang Wei and Krass, Mark and Krishna, Ranjay and Kuditipudi, Rohith and Kumar, Ananya and Ladhak, Faisal and Lee, Mina and Lee, Tony and Leskovec, Jure and Levent, Isabelle and Li, Xiang Lisa and Li, Xuechen and Ma, Tengyu and Malik, Ali and Manning, Christopher D. and Mirchandani, Suvir and Mitchell, Eric and Munyikwa, Zanele and Nair, Suraj and Narayan, Avanika and Narayanan, Deepak and Newman, Ben and Nie, Allen and Niebles, Juan Carlos and Nilforoshan, Hamed and Nyarko, Julian and Ogut, Giray and Orr, Laurel and Papadimitriou, Isabel and Park, Joon Sung and Piech, Chris and Portelance, Eva and Potts, Christopher and Raghunathan, Aditi and Reich, Rob and Ren, Hongyu and Rong, Frieda and Roohani, Yusuf and Ruiz, Camilo and Ryan, Jack and R{\'e}, Christopher and Sadigh, Dorsa and Sagawa, Shiori and Santhanam, Keshav and Shih, Andy and Srinivasan, Krishnan and Tamkin, Alex and Taori, Rohan and Thomas, Armin W. and Tram{\`e}r, Florian and Wang, Rose E. and Wang, William and Wu, Bohan and Wu, Jiajun and Wu, Yuhuai and Xie, Sang Michael and Yasunaga, Michihiro and You, Jiaxuan and Zaharia, Matei and Zhang, Michael and Zhang, Tianyi and Zhang, Xikun and Zhang, Yuhui and Zheng, Lucia and Zhou, Kaitlyn and Liang, Percy},
  year = 2022,
  month = jul,
  number = {arXiv:2108.07258},
  eprint = {2108.07258},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2108.07258},
  urldate = {2026-03-21},
  abstract = {AI is undergoing a paradigm shift with the rise of models (e.g., BERT, DALL-E, GPT-3) that are trained on broad data at scale and are adaptable to a wide range of downstream tasks. We call these models foundation models to underscore their critically central yet incomplete character. This report provides a thorough account of the opportunities and risks of foundation models, ranging from their capabilities (e.g., language, vision, robotics, reasoning, human interaction) and technical principles (e.g., model architectures, training procedures, data, systems, security, evaluation, theory) to their applications (e.g., law, healthcare, education) and societal impact (e.g., inequity, misuse, economic and environmental impact, legal and ethical considerations). Though foundation models are based on standard deep learning and transfer learning, their scale results in new emergent capabilities, and their effectiveness across so many tasks incentivizes homogenization. Homogenization provides powerful leverage but demands caution, as the defects of the foundation model are inherited by all the adapted models downstream. Despite the impending widespread deployment of foundation models, we currently lack a clear understanding of how they work, when they fail, and what they are even capable of due to their emergent properties. To tackle these questions, we believe much of the critical research on foundation models will require deep interdisciplinary collaboration commensurate with their fundamentally sociotechnical nature.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computers and Society,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/UQM7FKXJ/Bommasani et al. - 2022 - On the Opportunities and Risks of Foundation Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/HEH7FUBP/2108.html}
}

% Chapter in The Cambridge Handbook of Artificial Intelligence (pp. 316--334).
@incollection{bostromEthicsArtificialIntelligence2014,
  author = {Bostrom, Nick and Yudkowsky, Eliezer},
  title = {The Ethics of Artificial Intelligence},
  editor = {Frankish, Keith and Ramsey, William M.},
  booktitle = {The {{Cambridge Handbook}} of {{Artificial Intelligence}}},
  pages = {316--334},
  year = 2014,
  publisher = {Cambridge University Press},
  address = {Cambridge},
  isbn = {978-0-521-87142-6},
  doi = {10.1017/CBO9781139046855.020},
  urldate = {2026-03-21},
  keywords = {artificial intelligences,Deep Blue's programmers,ethical challenges,machine ethics,moral status,superhuman chess player,superintelligence},
  abstract = {This chapter surveys some of the ethical challenges that may arise as one can create artificial intelligences (AI) of various kinds and degrees. Some challenges of machine ethics are much like many other challenges involved in designing machines. There is nearly universal agreement among modern AI professionals that artificial intelligence falls short of human capabilities in some critical sense, even though AI algorithms have beaten humans in many specific domains such as chess. In creating a superhuman chess player, the human programmers necessarily sacrificed their ability to predict Deep Blue's local, specific game behavior. A different set of ethical issues arises when one can contemplate the possibility that some future AI systems might be candidates for having moral status. One also has moral reasons to treat them in certain ways, and to refrain from treating them in certain other ways. Superintelligence may be achievable by increasing processing speed.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/DVIEEAPV/B46D2A9DF7CF3A9D92601D9A8ADA58A8.html}
}

% Bostrom's book "Superintelligence" (Oxford University Press, 2014).
@book{bostromSuperintelligencePathsDangers2014a,
  author = {Bostrom, Nick},
  title = {Superintelligence: {{Paths}}, {{Dangers}}, {{Strategies}}},
  shorttitle = {Superintelligence},
  year = 2014,
  publisher = {Oxford University Press},
  isbn = {978-0-19-967811-2},
  googlebooks = {7\_H8AwAAQBAJ},
  langid = {english},
  keywords = {Computers / Artificial Intelligence / General,Computers / Computer Science,Computers / General,Computers / Human-Computer Interaction (HCI),Computers / Social Aspects,Mathematics / Game Theory,Social Science / Future Studies},
  abstract = {The human brain has some capabilities that the brains of other animals lack. It is to these distinctive capabilities that our species owes its dominant position. Other animals have stronger muscles or sharper claws, but we have cleverer brains. If machine brains one day come to surpass human brains in general intelligence, then this new superintelligence could become very powerful. As the fate of the gorillas now depends more on us humans than on the gorillas themselves, so the fate of our species then would come to depend on the actions of the machine superintelligence. But we have one advantage: we get to make the first move. Will it be possible to construct a seed AI or otherwise to engineer initial conditions so as to make an intelligence explosion survivable? How could one achieve a controlled detonation? To get closer to an answer to this question, we must make our way through a fascinating landscape of topics and considerations. Read the book and learn about oracles, genies, singletons; about boxing methods, tripwires, and mind crime; about humanity's cosmic endowment and differential technological development; indirect normativity, instrumental convergence, whole brain emulation and technology couplings; Malthusian economics and dystopian evolution; artificial intelligence, and biological cognitive enhancement, and collective intelligence.}
}

@article{bostromVulnerableWorldHypothesis2019,
  author = {Bostrom, Nick},
  title = {The {{Vulnerable World Hypothesis}}},
  journal = {Global Policy},
  year = 2019,
  volume = {10},
  number = {4},
  pages = {455--476},
  doi = {10.1111/1758-5899.12718},
  issn = {1758-5899},
  urldate = {2025-12-24},
  langid = {english},
  copyright = {\copyright{} 2019 The Authors. Global Policy published by Durham University and John Wiley \& Sons Ltd.},
  abstract = {Scientific and technological progress might change people's capabilities or incentives in ways that would destabilize civilization. For example, advances in DIY biohacking tools might make it easy for anybody with basic training in biology to kill millions; novel military technologies could trigger arms races in which whoever strikes first has a decisive advantage; or some economically advantageous process may be invented that produces disastrous negative global externalities that are hard to regulate. This paper introduces the concept of a vulnerable world: roughly, one in which there is some level of technological development at which civilization almost certainly gets devastated by default, i.e. unless it has exited the `semi-anarchic default condition'. Several counterfactual historical and speculative future vulnerabilities are analyzed and arranged into a typology. A general ability to stabilize a vulnerable world would require greatly amplified capacities for preventive policing and global governance. The vulnerable world hypothesis thus offers a new perspective from which to evaluate the risk-benefit balance of developments towards ubiquitous surveillance or a unipolar world order.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/AUUB5LUE/Bostrom - 2019 - The Vulnerable World Hypothesis.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/CIUSZXWS/1758-5899.html}
}

@misc{bowmanMeasuringProgressScalable2022,
  title = {Measuring {{Progress}} on {{Scalable Oversight}} for {{Large Language Models}}},
  author = {Bowman, Samuel R. and Hyun, Jeeyoon and Perez, Ethan and Chen, Edwin and Pettit, Craig and Heiner, Scott and Luko{\v s}i{\=u}t{\.e}, Kamil{\.e} and Askell, Amanda and Jones, Andy and Chen, Anna and Goldie, Anna and Mirhoseini, Azalia and McKinnon, Cameron and Olah, Christopher and Amodei, Daniela and Amodei, Dario and Drain, Dawn and Li, Dustin and {Tran-Johnson}, Eli and Kernion, Jackson and Kerr, Jamie and Mueller, Jared and Ladish, Jeffrey and Landau, Joshua and Ndousse, Kamal and Lovitt, Liane and Elhage, Nelson and Schiefer, Nicholas and Joseph, Nicholas and Mercado, Noem{\'i} and DasSarma, Nova and Larson, Robin and McCandlish, Sam and Kundu, Sandipan and Johnston, Scott and Kravec, Shauna and El Showk, Sheer and Fort, Stanislav and {Telleen-Lawton}, Timothy and Brown, Tom and Henighan, Tom and Hume, Tristan and Bai, Yuntao and {Hatfield-Dodds}, Zac and Mann, Ben and Kaplan, Jared},
  year = 2022,
  month = nov,
  number = {arXiv:2211.03540},
  eprint = {2211.03540},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2211.03540},
  urldate = {2025-12-23},
  abstract = {Developing safe and useful general-purpose AI systems will require us to make progress on scalable oversight: the problem of supervising systems that potentially outperform us on most skills relevant to the task at hand. Empirical work on this problem is not straightforward, since we do not yet have systems that broadly exceed our abilities. This paper discusses one of the major ways we think about this problem, with a focus on ways it can be studied empirically. We first present an experimental design centered on tasks for which human specialists succeed but unaided humans and current general AI systems fail. We then present a proof-of-concept experiment meant to demonstrate a key feature of this experimental design and show its viability with two question-answering tasks: MMLU and time-limited QuALITY. On these tasks, we find that human participants who interact with an unreliable large-language-model dialog assistant through chat -- a trivial baseline strategy for scalable oversight -- substantially outperform both the model alone and their own unaided performance. These results are an encouraging sign that scalable oversight will be tractable to study with present models and bolster recent findings that large language models can productively assist humans with difficult tasks.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Human-Computer Interaction},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/DRFKV2P3/Bowman et al. - 2022 - Measuring Progress on Scalable Oversight for Large Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/7UGAHS3M/2211.html}
}

@article{bradleyRANKANALYSISINCOMPLETE1952,
  title = {Rank {{Analysis}} of {{Incomplete Block Designs}}: {{The Method}} of {{Paired Comparisons}}},
  shorttitle = {Rank {{Analysis}} of {{Incomplete Block Designs}}},
  author = {Bradley, Ralph Allan and Terry, Milton E.},
  year = 1952,
  month = dec,
  journal = {Biometrika},
  volume = {39},
  number = {3--4},
  pages = {324--345},
  issn = {0006-3444},
  doi = {10.1093/biomet/39.3-4.324},
  urldate = {2025-12-15},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/BIV42Y65/39.3-4.html}
}

@book{bratmanIntentionPlansPractical1987,
  title = {Intention, {{Plans}}, and {{Practical Reason}}},
  author = {Bratman, Michael},
  year = 1987,
  publisher = {Harvard University Press},
  address = {Cambridge, MA},
  urldate = {2026-02-03},
  abstract = {viii, 200 p. ; 25 cm; Bibliography: p. [169]-173; Includes index},
  collaborator = {{Internet Archive}},
  isbn = {978-0-674-45818-5},
  langid = {english},
  keywords = {Intentionality (Philosophy)}
}

@misc{brownEchoChambersRabbit2022,
  author = {Brown, Megan A. and Bisbee, James and Lai, Angela and Bonneau, Richard and Nagler, Jonathan and Tucker, Joshua A.},
  title = {Echo {{Chambers}}, {{Rabbit Holes}}, and {{Algorithmic Bias}}: {{How YouTube Recommends Content}} to {{Real Users}}},
  shorttitle = {Echo {{Chambers}}, {{Rabbit Holes}}, and {{Algorithmic Bias}}},
  type = {{{SSRN Scholarly Paper}}},
  year = 2022,
  month = may,
  number = {4114905},
  publisher = {Social Science Research Network},
  address = {Rochester, NY},
  eprint = {4114905},
  archiveprefix = {Social Science Research Network},
  doi = {10.2139/ssrn.4114905},
  urldate = {2025-11-21},
  langid = {english},
  keywords = {Echo Chambers,Political Polarization,Recommendation Algorithm,Theory Testing,YouTube},
  abstract = {To what extent does the YouTube recommendation algorithm push users into echo chambers, ideologically biased content, or rabbit holes? Using a novel method to estimate the ideology of YouTube videos and an original experimental design to isolate the effect of the algorithm from user choice, we demonstrate that the YouTube recommendation algorithm does, in fact, push real users into mild ideological echo chambers where, by the end of the data collection task, liberals and conservatives received different distributions of recommendations from each other, though this difference is small. While we find evidence that this difference increases the longer the user followed the recommendation algorithm, we do not find evidence that many go down `rabbit holes' that lead them to ideologically extreme content. Finally, we find that YouTube pushes all users, regardless of ideology, towards moderately conservative and an increasingly narrow range of ideological content the longer they follow YouTube's recommendations.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/LHQWU65R/Brown et al. - 2022 - Echo Chambers, Rabbit Holes, and Algorithmic Bias How YouTube Recommends Content to Real Users.pdf}
}

@misc{brownLanguageModelsAre2020,
  author = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and {Herbert-Voss}, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
  title = {Language {{Models}} Are {{Few-Shot Learners}}},
  year = 2020,
  month = jul,
  number = {arXiv:2005.14165},
  publisher = {arXiv},
  eprint = {2005.14165},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  doi = {10.48550/arXiv.2005.14165},
  urldate = {2025-11-26},
  keywords = {Computer Science - Computation and Language},
  abstract = {Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/P29I9EQI/Brown et al. - 2020 - Language Models are Few-Shot Learners.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/GWE2H7I9/2005.html}
}

@book{brunswikPerceptionRepresentativeDesign1956a,
  title = {Perception and the {{Representative Design}} of {{Psychological Experiments}}},
  author = {Brunswik, Egon},
  year = 1956,
  edition = {1},
  eprint = {jj.8501445},
  eprinttype = {jstor},
  publisher = {University of California Press},
  doi = {10.2307/jj.8501445},
  urldate = {2026-02-04},
  abstract = {This title is part of UC Press's Voices Revived program, which commemorates University of California Press's mission to seek out and cultivate the brightest minds and give them voice, reach, and impact. Drawing on a backlist dating to 1893, Voices Revived makes high-quality, peer-reviewed scholarship accessible once again using print-on-demand technology. This title was originally published in 1956.}
}

@misc{bubeckUniversalLawRobustness2022,
  title = {A {{Universal Law}} of {{Robustness}} via {{Isoperimetry}}},
  author = {Bubeck, S{\'e}bastien and Sellke, Mark},
  year = 2022,
  month = dec,
  number = {arXiv:2105.12806},
  eprint = {2105.12806},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2105.12806},
  urldate = {2025-12-23},
  abstract = {Classically, data interpolation with a parametrized model class is possible as long as the number of parameters is larger than the number of equations to be satisfied. A puzzling phenomenon in deep learning is that models are trained with many more parameters than what this classical theory would suggest. We propose a partial theoretical explanation for this phenomenon. We prove that for a broad class of data distributions and model classes, overparametrization is necessary if one wants to interpolate the data smoothly. Namely we show that smooth interpolation requires $d$ times more parameters than mere interpolation, where $d$ is the ambient data dimension. We prove this universal law of robustness for any smoothly parametrized function class with polynomial size weights, and any covariate distribution verifying isoperimetry. In the case of two-layers neural networks and Gaussian covariates, this law was conjectured in prior work by Bubeck, Li and Nagaraj. We also give an interpretation of our result as an improved generalization bound for model classes consisting of smooth functions.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/G5KLWJQ5/Bubeck and Sellke - 2022 - A Universal Law of Robustness via Isoperimetry.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/EC8PUS6X/2105.html}
}

@misc{burnsDiscoveringLatentKnowledge2024,
  title = {Discovering {{Latent Knowledge}} in {{Language Models Without Supervision}}},
  author = {Burns, Collin and Ye, Haotian and Klein, Dan and Steinhardt, Jacob},
  year = 2024,
  month = mar,
  number = {arXiv:2212.03827},
  eprint = {2212.03827},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2212.03827},
  urldate = {2026-02-08},
  abstract = {Existing techniques for training language models can be misaligned with the truth: if we train models with imitation learning, they may reproduce errors that humans make; if we train them to generate text that humans rate highly, they may output errors that human evaluators can't detect. We propose circumventing this issue by directly finding latent knowledge inside the internal activations of a language model in a purely unsupervised way. Specifically, we introduce a method for accurately answering yes-no questions given only unlabeled model activations. It works by finding a direction in activation space that satisfies logical consistency properties, such as that a statement and its negation have opposite truth values. We show that despite using no supervision and no model outputs, our method can recover diverse knowledge represented in large language models: across 6 models and 10 question-answering datasets, it outperforms zero-shot accuracy by 4\% on average. We also find that it cuts prompt sensitivity in half and continues to maintain high accuracy even when models are prompted to generate incorrect answers. Our results provide an initial step toward discovering what language models know, distinct from what they say, even when we don't have access to explicit ground truth labels.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/HG8MN7LB/Burns et al. - 2024 - Discovering Latent Knowledge in Language Models Without Supervision.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/LXI7ZY64/2212.html}
}

@misc{butlinConsciousnessArtificialIntelligence2023,
  author = {Butlin, Patrick and Long, Robert and Elmoznino, Eric and Bengio, Yoshua and Birch, Jonathan and Constant, Axel and Deane, George and Fleming, Stephen M. and Frith, Chris and Ji, Xu and Kanai, Ryota and Klein, Colin and Lindsay, Grace and Michel, Matthias and Mudrik, Liad and Peters, Megan A. K. and Schwitzgebel, Eric and Simon, Jonathan and VanRullen, Rufin},
  title = {Consciousness in {{Artificial Intelligence}}: {{Insights}} from the {{Science}} of {{Consciousness}}},
  shorttitle = {Consciousness in {{Artificial Intelligence}}},
  year = 2023,
  month = aug,
  number = {arXiv:2308.08708},
  publisher = {arXiv},
  eprint = {2308.08708},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  doi = {10.48550/arXiv.2308.08708},
  urldate = {2025-12-07},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computers and Society,Computer Science - Machine Learning,Quantitative Biology - Neurons and Cognition},
  abstract = {Whether current or near-term AI systems could be conscious is a topic of scientific interest and increasing public concern. This report argues for, and exemplifies, a rigorous and empirically grounded approach to AI consciousness: assessing existing AI systems in detail, in light of our best-supported neuroscientific theories of consciousness. We survey several prominent scientific theories of consciousness, including recurrent processing theory, global workspace theory, higher-order theories, predictive processing, and attention schema theory. From these theories we derive "indicator properties" of consciousness, elucidated in computational terms that allow us to assess AI systems for these properties. We use these indicator properties to assess several recent AI systems, and we discuss how future systems might implement them. Our analysis suggests that no current AI systems are conscious, but also suggests that there are no obvious technical barriers to building AI systems which satisfy these indicators.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/792989KQ/Butlin et al. - 2023 - Consciousness in Artificial Intelligence Insights from the Science of Consciousness.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/U87WRABV/2308.html}
}

@misc{buylLargeLanguageModels2025,
  title = {Large {{Language Models Reflect}} the {{Ideology}} of Their {{Creators}}},
  author = {Buyl, Maarten and Rogiers, Alexander and Noels, Sander and Bied, Guillaume and {Dominguez-Catena}, Iris and Heiter, Edith and Johary, Iman and Mara, Alexandru-Cristian and Romero, Rapha{\"e}l and Lijffijt, Jefrey and De Bie, Tijl},
  year = 2025,
  month = jan,
  number = {arXiv:2410.18417},
  eprint = {2410.18417},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2410.18417},
  urldate = {2026-01-25},
  abstract = {Large language models (LLMs) are trained on vast amounts of data to generate natural language, enabling them to perform tasks like text summarization and question answering. These models have become popular in artificial intelligence (AI) assistants like ChatGPT and already play an influential role in how humans access information. However, the behavior of LLMs varies depending on their design, training, and use. In this paper, we prompt a diverse panel of popular LLMs to describe a large number of prominent personalities with political relevance, in all six official languages of the United Nations. By identifying and analyzing moral assessments reflected in their responses, we find normative differences between LLMs from different geopolitical regions, as well as between the responses of the same LLM when prompted in different languages. Among only models in the United States, we find that popularly hypothesized disparities in political views are reflected in significant normative differences related to progressive values. Among Chinese models, we characterize a division between internationally- and domestically-focused models. Our results show that the ideological stance of an LLM appears to reflect the worldview of its creators. This poses the risk of political instrumentalization and raises concerns around technological and regulatory efforts with the stated aim of making LLMs ideologically 'unbiased'.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/VI8RIXIE/Buyl et al. - 2025 - Large Language Models Reflect the Ideology of their Creators.pdf}
}

@book{byrneAttractionParadigm1971,
  title = {The {{Attraction Paradigm}}},
  author = {Byrne, Donn Erwin},
  year = 1971,
  publisher = {Academic Press},
  address = {New York},
  urldate = {2026-01-13},
  abstract = {xi, 474 pages 24 cm; Includes bibliographical references (pages 443-462)},
  collaborator = {{Internet Archive}},
  isbn = {978-0-12-148650-1},
  langid = {english},
  keywords = {Interpersonal attraction}
}

@misc{bytez.comScalingTrendsLanguage2025,
  title = {Scaling {{Trends}} in {{Language Model Robustness}}},
  author = {Howe, Nikolaus H. R. and McKenzie, Ian R. and Hollinsworth, Oskar John and Zaj{\k a}c, Micha{\l} and Tseng, Tom and Tucker, Aaron David and Bacon, Pierre-Luc and Gleave, Adam},
  year = 2025,
  month = jul,
  urldate = {2026-01-05},
  abstract = {Researchers studied how the size of language models affects their ability to resist attacks, like being tricked by harmful prompts. They found that simply making these models larger doesn't always...},
  howpublished = {https://bytez.com/docs/icml/43784/paper},
  langid = {english},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/BX4FD6XM/paper.html}
}

@misc{carlsmithPowerSeekingAIExistential2024a,
  author = {Carlsmith, Joseph},
  title = {Is {{Power-Seeking AI}} an {{Existential Risk}}?},
  year = 2024,
  month = aug,
  number = {arXiv:2206.13353},
  publisher = {arXiv},
  eprint = {2206.13353},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  doi = {10.48550/arXiv.2206.13353},
  urldate = {2026-01-19},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computers and Society,Computer Science - Machine Learning},
  abstract = {This report examines what I see as the core argument for concern about existential risk from misaligned artificial intelligence. I proceed in two stages. First, I lay out a backdrop picture that informs such concern. On this picture, intelligent agency is an extremely powerful force, and creating agents much more intelligent than us is playing with fire -- especially given that if their objectives are problematic, such agents would plausibly have instrumental incentives to seek power over humans. Second, I formulate and evaluate a more specific six-premise argument that creating agents of this kind will lead to existential catastrophe by 2070. On this argument, by 2070: (1) it will become possible and financially feasible to build relevantly powerful and agentic AI systems; (2) there will be strong incentives to do so; (3) it will be much harder to build aligned (and relevantly powerful/agentic) AI systems than to build misaligned (and relevantly powerful/agentic) AI systems that are still superficially attractive to deploy; (4) some such misaligned systems will seek power over humans in high-impact ways; (5) this problem will scale to the full disempowerment of humanity; and (6) such disempowerment will constitute an existential catastrophe. I assign rough subjective credences to the premises in this argument, and I end up with an overall estimate of \textasciitilde 5\% that an existential catastrophe of this kind will occur by 2070. (May 2022 update: since making this report public in April 2021, my estimate here has gone up, and is now at {$>$}10\%.)},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/33UZDYRR/Carlsmith - 2024 - Is Power-Seeking AI an Existential Risk.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/YKUVUBCK/2206.html}
}

@misc{casperOpenProblemsFundamental2023,
  author = {Casper, Stephen and Davies, Xander and Shi, Claudia and Gilbert, Thomas Krendl and Scheurer, J{\'e}r{\'e}my and Rando, Javier and Freedman, Rachel and Korbak, Tomasz and Lindner, David and Freire, Pedro and Wang, Tony and Marks, Samuel and Segerie, Charbel-Rapha{\"e}l and Carroll, Micah and Peng, Andi and Christoffersen, Phillip and Damani, Mehul and Slocum, Stewart and Anwar, Usman and Siththaranjan, Anand and Nadeau, Max and Michaud, Eric J. and Pfau, Jacob and Krasheninnikov, Dmitrii and Chen, Xin and Langosco, Lauro and Hase, Peter and B{\i}y{\i}k, Erdem and Dragan, Anca and Krueger, David and Sadigh, Dorsa and {Hadfield-Menell}, Dylan},
  title = {Open {{Problems}} and {{Fundamental Limitations}} of {{Reinforcement Learning}} from {{Human Feedback}}},
  year = 2023,
  month = sep,
  number = {arXiv:2307.15217},
  publisher = {arXiv},
  eprint = {2307.15217},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  doi = {10.48550/arXiv.2307.15217},
  urldate = {2025-12-01},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
  abstract = {Reinforcement learning from human feedback (RLHF) is a technique for training AI systems to align with human goals. RLHF has emerged as the central method used to finetune state-of-the-art large language models (LLMs). Despite this popularity, there has been relatively little public work systematizing its flaws. In this paper, we (1) survey open problems and fundamental limitations of RLHF and related methods; (2) overview techniques to understand, improve, and complement RLHF in practice; and (3) propose auditing and disclosure standards to improve societal oversight of RLHF systems. Our work emphasizes the limitations of RLHF and highlights the importance of a multi-faceted approach to the development of safer AI systems.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/887C4SSF/Casper et al. - 2023 - Open Problems and Fundamental Limitations of Reinforcement Learning from Human Feedback.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/CIYI6PS7/2307.html}
}

@inproceedings{chanHarmsIncreasinglyAgentic2023,
  title = {Harms from {{Increasingly Agentic Algorithmic Systems}}},
  booktitle = {Proceedings of the 2023 {{ACM Conference}} on {{Fairness}}, {{Accountability}}, and {{Transparency}}},
  author = {Chan, Alan and Salganik, Rebecca and Markelius, Alva and Pang, Chris and Rajkumar, Nitarshan and Krasheninnikov, Dmitrii and Langosco, Lauro and He, Zhonghao and Duan, Yawen and Carroll, Micah and Lin, Michelle and Mayhew, Alex and Collins, Katherine and Molamohammadi, Maryam and Burden, John and Zhao, Wanru and Rismani, Shalaleh and Voudouris, Konstantinos and Bhatt, Umang and Weller, Adrian and Krueger, David and Maharaj, Tegan},
  year = 2023,
  month = jun,
  series = {{{FAccT}} '23},
  pages = {651--666},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  doi = {10.1145/3593013.3594033},
  urldate = {2026-02-03},
  abstract = {Research in Fairness, Accountability, Transparency, and Ethics (FATE) has established many sources and forms of algorithmic harm, in domains as diverse as health care, finance, policing, and recommendations. Much work remains to be done to mitigate the serious harms of these systems, particularly those disproportionately affecting marginalized communities. Despite these ongoing harms, new systems are being developed and deployed, typically without strong regulatory barriers, threatening the perpetuation of the same harms and the creation of novel ones. In response, the FATE community has emphasized the importance of anticipating harms, rather than just responding to them. Anticipation of harms is especially important given the rapid pace of developments in machine learning (ML). Our work focuses on the anticipation of harms from increasingly agentic systems. Rather than providing a definition of agency as a binary property, we identify 4 key characteristics which, particularly in combination, tend to increase the agency of a given algorithmic system: underspecification, directness of impact, goal-directedness, and long-term planning. We also discuss important harms which arise from increasing agency -- notably, these include systemic and/or long-range impacts, often on marginalized or unconsidered stakeholders. We emphasize that recognizing agency of algorithmic systems does not absolve or shift the human responsibility for algorithmic harms. Rather, we use the term agency to highlight the increasingly evident fact that ML systems are not fully under human control. Our work explores increasingly agentic algorithmic systems in three parts. First, we explain the notion of an increase in agency for algorithmic systems in the context of diverse perspectives on agency across disciplines. Second, we argue for the need to anticipate harms from increasingly agentic systems. Third, we discuss important harms from increasingly agentic systems and ways forward for addressing them. We conclude by reflecting on implications of our work for anticipating algorithmic harms from emerging systems.},
  isbn = {979-8-4007-0192-4},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/9I6RURCR/Chan et al. - 2023 - Harms from Increasingly Agentic Algorithmic Systems.pdf}
}

@misc{cheeQuIP2BitQuantization2024,
  title = {{{QuIP}}: 2-{{Bit Quantization}} of {{Large Language Models With Guarantees}}},
  shorttitle = {{{QuIP}}},
  author = {Chee, Jerry and Cai, Yaohui and Kuleshov, Volodymyr and De Sa, Christopher},
  year = 2024,
  month = jan,
  number = {arXiv:2307.13304},
  eprint = {2307.13304},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2307.13304},
  urldate = {2026-01-25},
  abstract = {This work studies post-training parameter quantization in large language models (LLMs). We introduce quantization with incoherence processing (QuIP), a new method based on the insight that quantization benefits from \textit{incoherent} weight and Hessian matrices, i.e., from the weights being even in magnitude and the directions in which it is important to round them accurately being unaligned with the coordinate axes. QuIP consists of two steps: (1) an adaptive rounding procedure minimizing a quadratic proxy objective; (2) efficient pre- and post-processing that ensures weight and Hessian incoherence via multiplication by random orthogonal matrices. We complement QuIP with the first theoretical analysis for an LLM-scale quantization algorithm, and show that our theory also applies to an existing method, OPTQ. Empirically, we find that our incoherence preprocessing improves several existing quantization algorithms and yields the first LLM quantization methods that produce viable results using only two bits per weight. Our code can be found at https://github.com/Cornell-RelaxML/QuIP.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/DMZJ9AR5/Chee et al. - 2024 - QuIP 2-Bit Quantization of Large Language Models With Guarantees.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/HIT8P5Y7/2307.html}
}

% Codex / HumanEval paper, arXiv preprint 2107.03374.
@misc{chenEvaluatingLargeLanguage2021,
  author        = {Chen, Mark and Tworek, Jerry and Jun, Heewoo and Yuan, Qiming and Pinto, Henrique Ponde de Oliveira and Kaplan, Jared and Edwards, Harri and Burda, Yuri and Joseph, Nicholas and Brockman, Greg and Ray, Alex and Puri, Raul and Krueger, Gretchen and Petrov, Michael and Khlaaf, Heidy and Sastry, Girish and Mishkin, Pamela and Chan, Brooke and Gray, Scott and Ryder, Nick and Pavlov, Mikhail and Power, Alethea and Kaiser, Lukasz and Bavarian, Mohammad and Winter, Clemens and Tillet, Philippe and Such, Felipe Petroski and Cummings, Dave and Plappert, Matthias and Chantzis, Fotios and Barnes, Elizabeth and {Herbert-Voss}, Ariel and Guss, William Hebgen and Nichol, Alex and Paino, Alex and Tezak, Nikolas and Tang, Jie and Babuschkin, Igor and Balaji, Suchir and Jain, Shantanu and Saunders, William and Hesse, Christopher and Carr, Andrew N. and Leike, Jan and Achiam, Josh and Misra, Vedant and Morikawa, Evan and Radford, Alec and Knight, Matthew and Brundage, Miles and Murati, Mira and Mayer, Katie and Welinder, Peter and McGrew, Bob and Amodei, Dario and McCandlish, Sam and Sutskever, Ilya and Zaremba, Wojciech},
  title         = {Evaluating {{Large Language Models Trained}} on {{Code}}},
  year          = {2021},
  month         = jul,
  publisher     = {arXiv},
  number        = {arXiv:2107.03374},
  eprint        = {2107.03374},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  doi           = {10.48550/arXiv.2107.03374},
  urldate       = {2026-02-01},
  keywords      = {Computer Science - Machine Learning},
  abstract      = {We introduce Codex, a GPT language model fine-tuned on publicly available code from GitHub, and study its Python code-writing capabilities. A distinct production version of Codex powers GitHub Copilot. On HumanEval, a new evaluation set we release to measure functional correctness for synthesizing programs from docstrings, our model solves 28.8\% of the problems, while GPT-3 solves 0\% and GPT-J solves 11.4\%. Furthermore, we find that repeated sampling from the model is a surprisingly effective strategy for producing working solutions to difficult prompts. Using this method, we solve 70.2\% of our problems with 100 samples per problem. Careful investigation of our model reveals its limitations, including difficulty with docstrings describing long chains of operations and with binding operations to variables. Finally, we discuss the potential broader impacts of deploying powerful code generation technologies, covering safety, security, and economics.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/I2T5X6AG/Chen et al. - 2021 - Evaluating Large Language Models Trained on Code.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/QY7JKWH7/2107.html},
}

% Chain-of-thought faithfulness study, arXiv preprint 2505.05410.
@misc{chenReasoningModelsDont2025a,
  author        = {Chen, Yanda and Benton, Joe and Radhakrishnan, Ansh and Uesato, Jonathan and Denison, Carson and Schulman, John and Somani, Arushi and Hase, Peter and Wagner, Misha and Roger, Fabien and Mikulik, Vlad and Bowman, Samuel R. and Leike, Jan and Kaplan, Jared and Perez, Ethan},
  title         = {Reasoning {{Models Don}}'t {{Always Say What They Think}}},
  year          = {2025},
  month         = may,
  publisher     = {arXiv},
  number        = {arXiv:2505.05410},
  eprint        = {2505.05410},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  doi           = {10.48550/arXiv.2505.05410},
  urldate       = {2025-10-28},
  keywords      = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
  abstract      = {Chain-of-thought (CoT) offers a potential boon for AI safety as it allows monitoring a model's CoT to try to understand its intentions and reasoning processes. However, the effectiveness of such monitoring hinges on CoTs faithfully representing models' actual reasoning processes. We evaluate CoT faithfulness of state-of-the-art reasoning models across 6 reasoning hints presented in the prompts and find: (1) for most settings and models tested, CoTs reveal their usage of hints in at least 1\% of examples where they use the hint, but the reveal rate is often below 20\%, (2) outcome-based reinforcement learning initially improves faithfulness but plateaus without saturating, and (3) when reinforcement learning increases how frequently hints are used (reward hacking), the propensity to verbalize them does not increase, even without training against a CoT monitor. These results suggest that CoT monitoring is a promising way of noticing undesired behaviors during training and evaluations, but that it is not sufficient to rule them out. They also suggest that in settings like ours where CoT reasoning is not necessary, test-time monitoring of CoTs is unlikely to reliably catch rare and catastrophic unexpected behaviors.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/ZVHRW8LQ/Chen et al. - 2025 - Reasoning Models Don't Always Say What They Think.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/CMLH2PWD/2505.html},
}

% Blog post. Standard misc styles do not print the journal field, so the post
% address is supplied in howpublished to give the rendered reference a locator.
% NOTE(review): URL reconstructed from the attachment file name; confirm it resolves.
@misc{christianoClarifyingAIAlignment2018,
  title = {Clarifying ``{{AI}} Alignment''},
  author = {Christiano, Paul},
  year = 2018,
  month = apr,
  journal = {AI alignment},
  urldate = {2026-01-27},
  abstract = {Clarifying what I mean when I say that an AI is aligned.},
  howpublished = {https://ai-alignment.com/clarifying-ai-alignment-cec47cd69dd6},
  langid = {english},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/CCG5NCT8/clarifying-ai-alignment-cec47cd69dd6.html}
}

% Learning from human preference comparisons, arXiv preprint 1706.03741.
@misc{christianoDeepReinforcementLearning2017,
  author        = {Christiano, Paul and Leike, Jan and Brown, Tom B. and Martic, Miljan and Legg, Shane and Amodei, Dario},
  title         = {Deep Reinforcement Learning from Human Preferences},
  year          = {2017},
  month         = jun,
  publisher     = {arXiv},
  number        = {arXiv:1706.03741},
  eprint        = {1706.03741},
  archiveprefix = {arXiv},
  primaryclass  = {stat},
  doi           = {10.48550/arXiv.1706.03741},
  urldate       = {2025-10-28},
  keywords      = {Computer Science - Artificial Intelligence,Computer Science - Human-Computer Interaction,Computer Science - Machine Learning,Statistics - Machine Learning},
  abstract      = {For sophisticated reinforcement learning (RL) systems to interact usefully with real-world environments, we need to communicate complex goals to these systems. In this work, we explore goals defined in terms of (non-expert) human preferences between pairs of trajectory segments. We show that this approach can effectively solve complex RL tasks without access to the reward function, including Atari games and simulated robot locomotion, while providing feedback on less than one percent of our agent's interactions with the environment. This reduces the cost of human oversight far enough that it can be practically applied to state-of-the-art RL systems. To demonstrate the flexibility of our approach, we show that we can successfully train complex novel behaviors with about an hour of human time. These behaviors and environments are considerably more complex than any that have been previously learned from human feedback.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/V554LB5Q/Christiano et al. - 2023 - Deep reinforcement learning from human preferences.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/3EVUHNM5/1706.html},
}

% Parliamentary speech. The author is a person, so the name is given in
% Last, First form (not as a braced literal) and styles can parse and abbreviate it.
@misc{churchillwinstonsSpeechHouseCommons1947,
  title = {Speech to the {{House}} of {{Commons}}},
  author = {Churchill, Winston S.},
  year = 1947,
  month = nov,
  address = {House of Commons}
}

% Blog post. The byline lives in the author field rather than in the title,
% so styles can format, sort and label by author.
@misc{CITELayeredDefense2025,
  title = {{{CITE}}: {{Layered Defense}} in {{AI Chat Safety}}},
  author = {Padolsey, James},
  year = 2025,
  month = nov,
  urldate = {2025-11-18},
  howpublished = {https://blog.j11y.io/2025-11-13\_CITE-AI-Safety/},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/7WPJJET6/2025-11-13_CITE-AI-Safety.html}
}

% University-hosted presentation (the abstract calls it "this presentation").
% Typed as misc because no journal exists for it; an article entry without a
% journal triggers a missing-field warning. The hosting institution is carried
% in howpublished so that it is printed.
% NOTE(review): confirm the item type against the repository record.
@misc{clarkSellingFamilyTin2011,
  title = {Selling {{The Family Tin}}? {{Rail Privatisation}} in {{New Zealand}}, in the {{Light}} of {{Wider Railway}} and {{Network Industry Experience}}},
  shorttitle = {Selling {{The Family Tin}}?},
  author = {Clark, Ross},
  year = 2011,
  month = aug,
  howpublished = {Presentation, Te Herenga Waka---Victoria University of Wellington},
  urldate = {2026-01-19},
  abstract = {When New Zealand's railway system was privatised in 1993 it was as one entity thus avoiding any issues arising from the 'separation of wheel and rail'. Yet this approach failed in time in that in 2003 the New Zealand Government had to come in and purchase the track in order to bail out the operator. In May 2008 it elected to purchase the operator outright as well rather than persevere with what had become a very difficult relationship. This presentation argues that the debates over the privatisation of rail (in New Zealand and elsewhere) have neglected two major considerations. First there have been few substantive treatments of how market failure would bear on the privatisation process. Second there have been no comparisons made with the privatisations in other transport and infrastructural industries which have worked in financial terms anyway. If 'railways are different' as many in this industry insist then the paper will argue that the difference derives from the extent to which the industry needs to be subsidised. While public policy can deal with competitive privately-owned industries which need subsidy (such as 'social' bus services) and privately-owned monopolies which don't need subsidy such as airports or telecommunications it is the combination of these two elements (subsidy and monopoly in a context of market failure) which explains why the privatisation of the railway network in New Zealand fundamentally did not work. The paper to which Ross is speaking can be found here: http://www.iscr.org.nz/f63017980/17980\_ETC\_2010\_paper.pdf Commentary provided by Dave Heatley New Zealand Productivity Commission. Ross Clark is Rail Performance Manager for Transport Scotland the local equivalent of the Transport Agency except that it also has direct oversight of the railway network in Scotland. He went to the UK from New Zealand at the start of 2005 partly to see what a decent-sized railway looks like.
His professional background with Tranz Rail was as a business analyst in its Passenger Group. He also served for some years as an administration officer with New Zealand's highways and land transport agency and began his career in transport in the area of road safety economics. His other professional interests include getting to see first hand the world's best airports and for variety some of the really bad ones as well. Dave Heatley is a senior advisor at the New Zealand Productivity Commission currently at work on the international freight transport services inquiry. Prior to joining the Commission Dave was at Victoria University: firstly as an MBA student and subsequently as a Research Fellow for the Institute for the Study of Competition and Regulation. Here Dave produced original research on many topics including the Overseas Investment Act New Zealand railways state-owned enterprises and telecommunications policy. In addition to his first career as a computer programmer Dave has worked as an environmental lobbyist park ranger and entrepreneur. He founded a scientific software and aquaculture technology company that at its peak employed over 50 people. When not hard at work as an economist Dave can often be found on mountainsides - skiing climbing tramping or running.},
  langid = {newzealand},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/7TYD5QG4/Clark - 2011 - Selling The Family Tin Rail Privatisation in New Zealand, in the Light of Wider Railway and Network.pdf}
}

% Web page on anthropic.com (see howpublished). A corporate author is supplied,
% double-braced so it is kept as one indivisible name, giving styles something
% to sort and label by.
@misc{CollectiveConstitutionalAI2023,
  title = {Collective {{Constitutional AI}}: {{Aligning}} a {{Language Model}} with {{Public Input}}},
  shorttitle = {Collective {{Constitutional AI}}},
  author = {{Anthropic}},
  year = 2023,
  month = oct,
  urldate = {2025-11-30},
  abstract = {Anthropic is an AI safety and research company that's working to build reliable, interpretable, and steerable AI systems.},
  howpublished = {https://www.anthropic.com/news/collective-constitutional-ai-aligning-a-language-model-with-public-input},
  langid = {english},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/VAIQD4D9/collective-constitutional-ai-aligning-a-language-model-with-public-input.html}
}

% Whitepaper with a corporate author (double-braced so it stays one name).
@misc{collectiveintelligenceprojectCIP+Whitepaper,
  author  = {{Collective Intelligence Project}},
  title   = {{{CIP Whitepaper}}},
  year    = {2024},
  urldate = {2025-11-17},
  file    = {/home/james/snap/zotero-snap/common/Zotero/storage/MYY4SQMA/CIP+Whitepaper.pdf},
}

% Reference to the Common Crawl web corpus. The key field gives classic BibTeX
% something to sort and label by when no author is present; howpublished gives
% the reader a locator.
% NOTE(review): site address added from general knowledge; confirm.
@misc{CommonCrawl2025,
  title = {Common {{Crawl}}},
  key = {Common Crawl},
  year = 2025,
  howpublished = {https://commoncrawl.org/}
}

% Polis software project. The abstract is the project tagline with the leading
% emoji shortcode from the scrape removed.
@misc{compdemocracyPolis2025,
  title = {Polis},
  author = {{compdemocracy}},
  year = 2025,
  month = nov,
  urldate = {2025-11-16},
  abstract = {Open Source AI for large scale open ended feedback},
  copyright = {AGPL-3.0},
  howpublished = {The Computational Democracy Project},
  keywords = {civic-tech,data-science,deliberative-democracy,participatory-democracy}
}

% Training-cost estimates for frontier models, arXiv preprint 2405.21015.
@misc{cottierRisingCostsTraining2024,
  author        = {Cottier, Ben and Rahman, Robi and Fattorini, Loredana and Maslej, Nestor and Owen, David},
  title         = {The Rising Costs of Training Frontier {{AI}} Models},
  year          = {2024},
  month         = may,
  publisher     = {arXiv},
  number        = {arXiv:2405.21015},
  eprint        = {2405.21015},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  doi           = {10.48550/arXiv.2405.21015},
  urldate       = {2026-01-25},
  keywords      = {Computer Science - Computers and Society},
  abstract      = {The costs of training frontier AI models have grown dramatically in recent years, but there is limited public data on the magnitude and growth of these expenses. This paper develops a detailed cost model to address this gap, estimating training costs using three approaches that account for hardware, energy, cloud rental, and staff expenses. The analysis reveals that the amortized cost to train the most compute-intensive models has grown precipitously at a rate of 2.4x per year since 2016 (95\% CI: 2.0x to 3.1x). For key frontier models, such as GPT-4 and Gemini, the most significant expenses are AI accelerator chips and staff costs, each costing tens of millions of dollars. Other notable costs include server components (15-22\%), cluster-level interconnect (9-13\%), and energy consumption (2-6\%). If the trend of growing development costs continues, the largest training runs will cost more than a billion dollars by 2027, meaning that only the most well-funded organizations will be able to finance frontier AI models.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/VE34EYGF/Cottier et al. - 2024 - The rising costs of training frontier AI models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/BHXF3EHD/2405.html},
}

% Book on inflation and economic policy in Western countries (Rowman and Littlefield, 1981).
@book{courakisInflationDepressionEconomic1981,
  author      = {Courakis, Anthony S.},
  title       = {Inflation, {{Depression}}, and {{Economic Policy}} in the {{West}}},
  year        = {1981},
  month       = jan,
  publisher   = {Rowman \& Littlefield},
  isbn        = {978-0-389-20144-1},
  googlebooks = {OMe6UQxu1KcC},
  langid      = {english},
  keywords    = {Business & Economics / Development / Economic Development,Business & Economics / Economic History,Business & Economics / Economics / Comparative,Business & Economics / Economics / General,Business & Economics / Inflation,Business & Economics / Money & Monetary Policy,Political Science / Public Policy / Economic Policy},
  abstract    = {Experience during the last ten years has encouraged economists to review their judgements regarding behavior and policy. The experience of the 1970s brought inflation to prominence in the minds of policymakers and academic economists, raising questions about labor markets and other supply considerations, but also resulting in an atmosphere conducive to increasing attention on monetary and financial variables. An account of some of the issues that, in this environment, occupied the thoughts of economists and conditioned the responses of policymakers in various Western countries is what this volume is about.},
}

% Book (Yale University Press, 2021). The abstract is the publisher blurb with
% the fused headline/sentence boundary and stray tie characters repaired.
@book{crawfordAtlasAIPower2021,
  title = {The {{Atlas}} of {{AI}}: {{Power}}, {{Politics}}, and the {{Planetary Costs}} of {{Artificial Intelligence}}},
  shorttitle = {The {{Atlas}} of {{AI}}},
  author = {Crawford, Kate},
  year = 2021,
  month = apr,
  publisher = {Yale University Press},
  abstract = {The hidden costs of artificial intelligence, from natural resources and labor to privacy and freedom. What happens when artificial intelligence saturates political life and depletes the planet? How is AI shaping our understanding of ourselves and our societies? In this book Kate Crawford reveals how this planetary network is fueling a shift toward undemocratic governance and increased inequality. Drawing on more than a decade of research, award-winning science, and technology, Crawford reveals how AI is a technology of extraction: from the energy and minerals needed to build and sustain its infrastructure, to the exploited workers behind ``automated'' services, to the data AI collects from us. Rather than taking a narrow focus on code and algorithms, Crawford offers us a political and a material perspective on what it takes to make artificial intelligence and where it goes wrong. While technical systems present a veneer of objectivity, they are always systems of power. This is an urgent account of what is at stake as technology companies use artificial intelligence to reshape the world.},
  googlebooks = {XvEdEAAAQBAJ},
  isbn = {978-0-300-25239-2},
  langid = {english},
  keywords = {Business & Economics / Industries / Computers & Information Technology,Computers / Artificial Intelligence / General,Technology & Engineering / Telecommunications}
}

% ARCHES report, arXiv preprint 2006.04948.
% The abstract uses real LaTeX markup in place of the export-escaped form, and
% the authors' unresolved private macros are replaced by plain text.
% NOTE(review): the count "twenty-nine" comes from the published report; confirm.
@misc{critchAIResearchConsiderations2020,
  title = {{{AI Research Considerations}} for {{Human Existential Safety}} ({{ARCHES}})},
  author = {Critch, Andrew and Krueger, David},
  year = 2020,
  month = may,
  number = {arXiv:2006.04948},
  eprint = {2006.04948},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2006.04948},
  urldate = {2025-12-23},
  abstract = {Framed in positive terms, this report examines how technical AI research might be steered in a manner that is more attentive to humanity's long-term prospects for survival as a species. In negative terms, we ask what existential risks humanity might face from AI development in the next century, and by what principles contemporary technical research might be directed to address those risks. A key property of hypothetical AI technologies is introduced, called \emph{prepotence}, which is useful for delineating a variety of potential existential risks from artificial intelligence, even as AI paradigms might shift. A set of twenty-nine contemporary research directions are then examined for their potential benefit to existential safety. Each research direction is explained with a scenario-driven motivation, and examples of existing work from which to build. The research directions present their own risks and benefits to society that could occur at various scales of impact, and in particular are not guaranteed to benefit existential safety if major developments in them are deployed without adequate forethought and oversight. As such, each direction is accompanied by a consideration of potentially negative side effects.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computers and Society,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/3N7M5L9T/Critch and Krueger - 2020 - AI Research Considerations for Human Existential Safety (ARCHES).pdf;/home/james/snap/zotero-snap/common/Zotero/storage/3CH8F6K4/2006.html}
}

% Marlowe-Crowne social desirability scale paper.
% Given names are stored in full so styles can choose whether to abbreviate;
% the issue number is included; the case-variant duplicate keyword is dropped.
% NOTE(review): given names and issue number added from the published record; confirm.
@article{crowneNewScaleSocial1960,
  title = {A New Scale of Social Desirability Independent of Psychopathology},
  author = {Crowne, Douglas P. and Marlowe, David},
  year = 1960,
  month = aug,
  journal = {Journal of Consulting Psychology},
  volume = {24},
  number = {4},
  pages = {349--354},
  issn = {0095-8891},
  doi = {10.1037/h0047358},
  langid = {english},
  pmid = {13813058},
  keywords = {Humans,Personality,Personality Disorders,Psychopathology,Social Desirability}
}

% Journal article in Natural Language Processing, volume 31, issue 5.
@article{daleSovereignAI20252025,
  author   = {Dale, Robert},
  title    = {Sovereign {{AI}} in 2025},
  journal  = {Natural Language Processing},
  year     = {2025},
  month    = sep,
  volume   = {31},
  number   = {5},
  pages    = {1312--1321},
  issn     = {2977-0424},
  doi      = {10.1017/nlp.2025.10007},
  urldate  = {2026-01-06},
  langid   = {english},
  keywords = {AI,sovereignty},
  abstract = {At the London~Tech~Week event in early June, Nvidia CEO Jensen Huang praised the UK as the `envy of the world' when it comes to AI researchers, but he also criticised it as the largest AI ecosystem in the world without its own infrastructure. The criticism is somewhat self-serving: when the UK does get around to building out that infrastructure, it's certain to consist largely of chips sold by Huang's company. It's also unsurprising: Huang has been pitching the idea of `sovereign AI' since~at~least~2023, conscious that nation states are the next deep pockets to target after the hyperscalers and generously funded model builders. In a world where the only real contenders in the race for AI supremacy are the US and China, we look at how the pursuit of AI sovereignty is playing out across the rest of the planet.},
  file     = {/home/james/snap/zotero-snap/common/Zotero/storage/4MSKZWXK/Dale - 2025 - Sovereign AI in 2025.pdf},
}

% Guaranteed Safe AI framework paper, arXiv preprint 2405.06624.
% The first author's nickname uses TeX opening/closing quote pairs; straight
% double quotes would typeset as two closing quotes.
@misc{dalrympleGuaranteedSafeAI2024,
  title = {Towards {{Guaranteed Safe AI}}: {{A Framework}} for {{Ensuring Robust}} and {{Reliable AI Systems}}},
  shorttitle = {Towards {{Guaranteed Safe AI}}},
  author = {Dalrymple, David ``davidad'' and Skalse, Joar and Bengio, Yoshua and Russell, Stuart and Tegmark, Max and Seshia, Sanjit and Omohundro, Steve and Szegedy, Christian and Goldhaber, Ben and Ammann, Nora and Abate, Alessandro and Halpern, Joe and Barrett, Clark and Zhao, Ding and {Zhi-Xuan}, Tan and Wing, Jeannette and Tenenbaum, Joshua},
  year = 2024,
  month = jul,
  number = {arXiv:2405.06624},
  eprint = {2405.06624},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2405.06624},
  urldate = {2025-12-22},
  abstract = {Ensuring that AI systems reliably and robustly avoid harmful or dangerous behaviours is a crucial challenge, especially for AI systems with a high degree of autonomy and general intelligence, or systems used in safety-critical contexts. In this paper, we will introduce and define a family of approaches to AI safety, which we will refer to as guaranteed safe (GS) AI. The core feature of these approaches is that they aim to produce AI systems which are equipped with high-assurance quantitative safety guarantees. This is achieved by the interplay of three core components: a world model (which provides a mathematical description of how the AI system affects the outside world), a safety specification (which is a mathematical description of what effects are acceptable), and a verifier (which provides an auditable proof certificate that the AI satisfies the safety specification relative to the world model). We outline a number of approaches for creating each of these three core components, describe the main technical challenges, and suggest a number of potential solutions to them. We also argue for the necessity of this approach to AI safety, and for the inadequacy of the main alternative approaches.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/6N67VYJK/Dalrymple et al. - 2024 - Towards Guaranteed Safe AI A Framework for Ensuring Robust and Reliable AI Systems.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/ZAK7A55W/2405.html}
}

% Scenario forecast web page. Authors are people, so each is given in
% Last, First form instead of a braced literal. A year and a locator are
% supplied so the reference renders with a date and an address.
% NOTE(review): year/month and URL inferred from the access date and the
% attachment file name; confirm.
@misc{danielkokotajloAI2027,
  title = {{{AI}} 2027},
  author = {Kokotajlo, Daniel and Alexander, Scott and Larsen, Thomas and Lifland, Eli and Dean, Romeo},
  year = 2025,
  month = apr,
  urldate = {2025-07-16},
  abstract = {A research-backed AI scenario forecast.},
  howpublished = {https://ai-2027.com/},
  langid = {english},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/Y9CHSSTW/ai-2027.com.html}
}

% DeepSeek-V3.2 technical report, arXiv preprint 2512.02556.
% The first author is the corporate name, braced so it stays one unit.
@misc{deepseek-aiDeepSeekV32PushingFrontier2025,
  author        = {{DeepSeek-AI} and Liu, Aixin and Mei, Aoxue and Lin, Bangcai and Xue, Bing and Wang, Bingxuan and Xu, Bingzheng and Wu, Bochao and Zhang, Bowei and Lin, Chaofan and Dong, Chen and Lu, Chengda and Zhao, Chenggang and Deng, Chengqi and Xu, Chenhao and Ruan, Chong and Dai, Damai and Guo, Daya and Yang, Dejian and Chen, Deli and Li, Erhang and Zhou, Fangqi and Lin, Fangyun and Dai, Fucong and Hao, Guangbo and Chen, Guanting and Li, Guowei and Zhang, H. and Xu, Hanwei and Li, Hao and Liang, Haofen and Wei, Haoran and Zhang, Haowei and Luo, Haowen and Ji, Haozhe and Ding, Honghui and Tang, Hongxuan and Cao, Huanqi and Gao, Huazuo and Qu, Hui and Zeng, Hui and Huang, Jialiang and Li, Jiashi and Xu, Jiaxin and Hu, Jiewen and Chen, Jingchang and Xiang, Jingting and Yuan, Jingyang and Cheng, Jingyuan and Zhu, Jinhua and Ran, Jun and Jiang, Junguang and Qiu, Junjie and Li, Junlong and Song, Junxiao and Dong, Kai and Gao, Kaige and Guan, Kang and Huang, Kexin and Zhou, Kexing and Huang, Kezhao and Yu, Kuai and Wang, Lean and Zhang, Lecong and Wang, Lei and Zhao, Liang and Yin, Liangsheng and Guo, Lihua and Luo, Lingxiao and Ma, Linwang and Wang, Litong and Zhang, Liyue and Di, M. S. and Xu, M. Y. and Zhang, Mingchuan and Zhang, Minghua and Tang, Minghui and Zhou, Mingxu and Huang, Panpan and Cong, Peixin and Wang, Peiyi and Wang, Qiancheng and Zhu, Qihao and Li, Qingyang and Chen, Qinyu and Du, Qiushi and Xu, Ruiling and Ge, Ruiqi and Zhang, Ruisong and Pan, Ruizhe and Wang, Runji and Yin, Runqiu and Xu, Runxin and Shen, Ruomeng and Zhang, Ruoyu and Liu, S. H.
    and Lu, Shanghao and Zhou, Shangyan and Chen, Shanhuang and Cai, Shaofei and Chen, Shaoyuan and Hu, Shengding and Liu, Shengyu and Hu, Shiqiang and Ma, Shirong and Wang, Shiyu and Yu, Shuiping and Zhou, Shunfeng and Pan, Shuting and Zhou, Songyang and Ni, Tao and Yun, Tao and Pei, Tian and Ye, Tian and Yue, Tianyuan and Zeng, Wangding and Liu, Wen and Liang, Wenfeng and Pang, Wenjie and Luo, Wenjing and Gao, Wenjun and Zhang, Wentao and Gao, Xi and Wang, Xiangwen and Bi, Xiao and Liu, Xiaodong and Wang, Xiaohan and Chen, Xiaokang and Zhang, Xiaokang and Nie, Xiaotao and Cheng, Xin and Liu, Xin and Xie, Xin and Liu, Xingchao and Yu, Xingkai and Li, Xingyou and Yang, Xinyu and Li, Xinyuan and Chen, Xu and Su, Xuecheng and Pan, Xuehai and Lin, Xuheng and Fu, Xuwei and Wang, Y. Q. and Zhang, Yang and Xu, Yanhong and Ma, Yanru and Li, Yao and Li, Yao and Zhao, Yao and Sun, Yaofeng and Wang, Yaohui and Qian, Yi and Yu, Yi and Zhang, Yichao and Ding, Yifan and Shi, Yifan and Xiong, Yiliang and He, Ying and Zhou, Ying and Zhong, Yinmin and Piao, Yishi and Wang, Yisong and Chen, Yixiao and Tan, Yixuan and Wei, Yixuan and Ma, Yiyang and Liu, Yiyuan and Yang, Yonglun and Guo, Yongqiang and Wu, Yongtong and Wu, Yu and Cheng, Yuan and Ou, Yuan and Xu, Yuanfan and Wang, Yuduan and Gong, Yue and Wu, Yuhan and Zou, Yuheng and Li, Yukun and Xiong, Yunfan and Luo, Yuxiang and You, Yuxiang and Liu, Yuxuan and Zhou, Yuyang and Wu, Z. F. and Ren, Z. Z. and Zhao, Zehua and Ren, Zehui and Sha, Zhangli and Fu, Zhe and Xu, Zhean and Xie, Zhenda and Zhang, Zhengyan and Hao, Zhewen and Gou, Zhibin and Ma, Zhicheng and Yan, Zhigang and Shao, Zhihong and Huang, Zhixian and Wu, Zhiyu and Li, Zhuoshu and Zhang, Zhuping and Xu, Zian and Wang, Zihao and Gu, Zihui and Zhu, Zijia and Li, Zilin and Zhang, Zipeng and Xie, Ziwei and Gao, Ziyi and Pan, Zizheng and Yao, Zongqing and Feng, Bei and Li, Hui and Cai, J. L. and Ni, Jiaqi and Xu, Lei and Li, Meng and Tian, Ning and Chen, R. J. and Jin, R. L.
    and Li, S. S. and Zhou, Shuang and Sun, Tianyu and Li, X. Q. and Jin, Xiangyue and Shen, Xiaojin and Chen, Xiaosha and Song, Xinnan and Zhou, Xinyi and Zhu, Y. X. and Huang, Yanping and Li, Yaohui and Zheng, Yi and Zhu, Yuchen and Ma, Yunxian and Huang, Zhen and Xu, Zhipeng and Zhang, Zhongyu and Ji, Dongjie and Liang, Jian and Guo, Jianzhong and Chen, Jin and Xia, Leyi and Wang, Miaojun and Li, Mingming and Zhang, Peng and Chen, Ruyi and Sun, Shangmian and Wu, Shaoqing and Ye, Shengfeng and Wang, T. and Xiao, W. L. and An, Wei and Wang, Xianzu and Sun, Xiaowen and Wang, Xiaoxiang and Tang, Ying and Zha, Yukun and Zhang, Zekai and Ju, Zhe and Zhang, Zhen and Qu, Zihua},
  title         = {{{DeepSeek-V3}}.2: {{Pushing}} the {{Frontier}} of {{Open Large Language Models}}},
  shorttitle    = {{{DeepSeek-V3}}.2},
  year          = {2025},
  month         = dec,
  publisher     = {arXiv},
  number        = {arXiv:2512.02556},
  eprint        = {2512.02556},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  doi           = {10.48550/arXiv.2512.02556},
  urldate       = {2026-01-25},
  keywords      = {Computer Science - Computation and Language},
  abstract      = {We introduce DeepSeek-V3.2, a model that harmonizes high computational efficiency with superior reasoning and agent performance. The key technical breakthroughs of DeepSeek-V3.2 are as follows: (1) DeepSeek Sparse Attention (DSA): We introduce DSA, an efficient attention mechanism that substantially reduces computational complexity while preserving model performance in long-context scenarios. (2) Scalable Reinforcement Learning Framework: By implementing a robust reinforcement learning protocol and scaling post-training compute, DeepSeek-V3.2 performs comparably to GPT-5. Notably, our high-compute variant, DeepSeek-V3.2-Speciale, surpasses GPT-5 and exhibits reasoning proficiency on par with Gemini-3.0-Pro, achieving gold-medal performance in both the 2025 International Mathematical Olympiad (IMO) and the International Olympiad in Informatics (IOI). (3) Large-Scale Agentic Task Synthesis Pipeline: To integrate reasoning into tool-use scenarios, we developed a novel synthesis pipeline that systematically generates training data at scale. This methodology facilitates scalable agentic post-training, yielding substantial improvements in generalization and instruction-following robustness within complex, interactive environments.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/RNGIDB9J/DeepSeek-AI et al. - 2025 - DeepSeek-V3.2 Pushing the Frontier of Open Large Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/35V3ZG4K/2512.html},
}

% Book, MIT Press; the address field holds the publisher's city.
@book{dennettIntentionalStance1989,
  author    = {Dennett, Daniel C.},
  title     = {The {{Intentional Stance}}},
  year      = {1989},
  month     = mar,
  publisher = {MIT Press},
  address   = {Cambridge, MA, USA},
  isbn      = {978-0-262-54053-7},
  langid    = {english},
}

% LLM.int8() quantization paper, arXiv preprint 2208.07339.
% The method name is brace-protected as a whole in its own casing (as written
% in the abstract) so neither title-casing nor sentence-casing styles alter it.
@misc{dettmersLLMint88bitMatrix2022,
  title = {{{LLM.int8}}(): 8-Bit {{Matrix Multiplication}} for {{Transformers}} at {{Scale}}},
  shorttitle = {{{LLM.int8}}()},
  author = {Dettmers, Tim and Lewis, Mike and Belkada, Younes and Zettlemoyer, Luke},
  year = 2022,
  month = nov,
  number = {arXiv:2208.07339},
  eprint = {2208.07339},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2208.07339},
  urldate = {2026-01-06},
  abstract = {Large language models have been widely adopted but require significant GPU memory for inference. We develop a procedure for Int8 matrix multiplication for feed-forward and attention projection layers in transformers, which cut the memory needed for inference by half while retaining full precision performance. With our method, a 175B parameter 16/32-bit checkpoint can be loaded, converted to Int8, and used immediately without performance degradation. This is made possible by understanding and working around properties of highly systematic emergent features in transformer language models that dominate attention and transformer predictive performance. To cope with these features, we develop a two-part quantization procedure, LLM.int8(). We first use vector-wise quantization with separate normalization constants for each inner product in the matrix multiplication, to quantize most of the features. However, for the emergent outliers, we also include a new mixed-precision decomposition scheme, which isolates the outlier feature dimensions into a 16-bit matrix multiplication while still more than 99.9\% of values are multiplied in 8-bit. Using LLM.int8(), we show empirically it is possible to perform inference in LLMs with up to 175B parameters without any performance degradation. This result makes such models much more accessible, for example making it possible to use OPT-175B/BLOOM on a single server with consumer GPUs. We open-source our software.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/NBGPH6B3/Dettmers et al. - 2022 - LLM.int8() 8-bit Matrix Multiplication for Transformers at Scale.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/S2MFRZ6Z/2208.html}
}

@misc{dettmersQLoRAEfficientFinetuning2023,
  title = {{{QLoRA}}: {{Efficient Finetuning}} of {{Quantized LLMs}}},
  shorttitle = {{{QLoRA}}},
  author = {Dettmers, Tim and Pagnoni, Artidoro and Holtzman, Ari and Zettlemoyer, Luke},
  year = 2023,
  month = may,
  number = {arXiv:2305.14314},
  eprint = {2305.14314},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2305.14314},
  urldate = {2026-01-06},
  abstract = {We present QLoRA, an efficient finetuning approach that reduces memory usage enough to finetune a 65B parameter model on a single 48GB GPU while preserving full 16-bit finetuning task performance. QLoRA backpropagates gradients through a frozen, 4-bit quantized pretrained language model into Low Rank Adapters~(LoRA). Our best model family, which we name Guanaco, outperforms all previous openly released models on the Vicuna benchmark, reaching 99.3\% of the performance level of ChatGPT while only requiring 24 hours of finetuning on a single GPU. QLoRA introduces a number of innovations to save memory without sacrificing performance: (a) 4-bit NormalFloat (NF4), a new data type that is information theoretically optimal for normally distributed weights (b) double quantization to reduce the average memory footprint by quantizing the quantization constants, and (c) paged optimizers to manage memory spikes. We use QLoRA to finetune more than 1,000 models, providing a detailed analysis of instruction following and chatbot performance across 8 instruction datasets, multiple model types (LLaMA, T5), and model scales that would be infeasible to run with regular finetuning (e.g. 33B and 65B parameter models). Our results show that QLoRA finetuning on a small high-quality dataset leads to state-of-the-art results, even when using smaller models than the previous SoTA. We provide a detailed analysis of chatbot performance based on both human and GPT-4 evaluations showing that GPT-4 evaluations are a cheap and reasonable alternative to human evaluation. Furthermore, we find that current chatbot benchmarks are not trustworthy to accurately evaluate the performance levels of chatbots. A lemon-picked analysis demonstrates where Guanaco fails compared to ChatGPT. We release all of our models and code, including CUDA kernels for 4-bit training.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/Y39PHP3D/Dettmers et al. - 2023 - QLoRA Efficient Finetuning of Quantized LLMs.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/7IE6EPTX/2305.html}
}

@misc{durmusMeasuringRepresentationSubjective2024,
  author        = {Durmus, Esin and Nguyen, Karina and Liao, Thomas I. and Schiefer, Nicholas and Askell, Amanda and Bakhtin, Anton and Chen, Carol and {Hatfield-Dodds}, Zac and Hernandez, Danny and Joseph, Nicholas and Lovitt, Liane and McCandlish, Sam and Sikder, Orowa and Tamkin, Alex and Thamkul, Janel and Kaplan, Jared and Clark, Jack and Ganguli, Deep},
  title         = {Towards {{Measuring}} the {{Representation}} of {{Subjective Global Opinions}} in {{Language Models}}},
  year          = 2024,
  month         = apr,
  eprint        = {2306.16388},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  number        = {arXiv:2306.16388},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.2306.16388},
  urldate       = {2025-12-07},
  keywords      = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
  abstract      = {Large language models (LLMs) may not equitably represent diverse global perspectives on societal issues. In this paper, we develop a quantitative framework to evaluate whose opinions model-generated responses are more similar to. We first build a dataset, GlobalOpinionQA, comprised of questions and answers from cross-national surveys designed to capture diverse opinions on global issues across different countries. Next, we define a metric that quantifies the similarity between LLM-generated survey responses and human responses, conditioned on country. With our framework, we run three experiments on an LLM trained to be helpful, honest, and harmless with Constitutional AI. By default, LLM responses tend to be more similar to the opinions of certain populations, such as those from the USA, and some European and South American countries, highlighting the potential for biases. When we prompt the model to consider a particular country's perspective, responses shift to be more similar to the opinions of the prompted populations, but can reflect harmful cultural stereotypes. When we translate GlobalOpinionQA questions to a target language, the model's responses do not necessarily become the most similar to the opinions of speakers of those languages. We release our dataset for others to use and build on. Our data is at https://huggingface.co/datasets/Anthropic/llm\_global\_opinions. We also provide an interactive visualization at https://llmglobalvalues.anthropic.com.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/88BF6S3Y/Durmus et al. - 2024 - Towards Measuring the Representation of Subjective Global Opinions in Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/RJKSRKK6/2306.html}
}

@misc{edelmanFullStackAlignmentCoAligning2025,
  author        = {Edelman, Joe and {Zhi-Xuan}, Tan and Lowe, Ryan and Klingefjord, Oliver and {Wang-Mascianica}, Vincent and Franklin, Matija and Kearns, Ryan Othniel and Hain, Ellie and Sarkar, Atrisha and Bakker, Michiel and Barez, Fazl and Duvenaud, David and Foerster, Jakob and Gabriel, Iason and Gubbels, Joseph and Goodman, Bryce and Haupt, Andreas and Heitzig, Jobst and {Jara-Ettinger}, Julian and Kasirzadeh, Atoosa and Kirkpatrick, James Ravi and Koh, Andrew and Knox, W. Bradley and Koralus, Philipp and Lehman, Joel and Levine, Sydney and Marro, Samuele and Revel, Manon and Shorin, Toby and Sutherland, Morgan and Tessler, Michael Henry and Vendrov, Ivan and {Wilken-Smith}, James},
  title         = {Full-{{Stack Alignment}}: {{Co-Aligning AI}} and {{Institutions}} with {{Thick Models}} of {{Value}}},
  shorttitle    = {Full-{{Stack Alignment}}},
  year          = 2025,
  month         = dec,
  eprint        = {2512.03399},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  number        = {arXiv:2512.03399},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.2512.03399},
  urldate       = {2025-12-17},
  keywords      = {Computer Science - Machine Learning},
  abstract      = {Beneficial societal outcomes cannot be guaranteed by aligning individual AI systems with the intentions of their operators or users. Even an AI system that is perfectly aligned to the intentions of its operating organization can lead to bad outcomes if the goals of that organization are misaligned with those of other institutions and individuals. For this reason, we need full-stack alignment, the concurrent alignment of AI systems and the institutions that shape them with what people value. This can be done without imposing a particular vision of individual or collective flourishing. We argue that current approaches for representing values, such as utility functions, preference orderings, or unstructured text, struggle to address these and other issues effectively. They struggle to distinguish values from other signals, to support principled normative reasoning, and to model collective goods. We propose thick models of value will be needed. These structure the way values and norms are represented, enabling systems to distinguish enduring values from fleeting preferences, to model the social embedding of individual choices, and to reason normatively, applying values in new domains. We demonstrate this approach in five areas: AI value stewardship, normatively competent agents, win-win negotiation systems, meaning-preserving economic mechanisms, and democratic regulatory institutions.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/NWP4UHXY/Wilken-Smith - 2024 - Full-Stack Alignment Co-Aligning AI and Institutions with Thick Models of Value.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/ZFCAPEUU/2512.html}
}

@misc{engelsScalingLawsScalable2025,
  author        = {Engels, Joshua and Baek, David D. and Kantamneni, Subhash and Tegmark, Max},
  title         = {Scaling {{Laws For Scalable Oversight}}},
  year          = 2025,
  month         = oct,
  eprint        = {2504.18530},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  number        = {arXiv:2504.18530},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.2504.18530},
  urldate       = {2025-12-23},
  keywords      = {Computer Science - Artificial Intelligence,Computer Science - Computers and Society,Computer Science - Machine Learning},
  abstract      = {Scalable oversight, the process by which weaker AI systems supervise stronger ones, has been proposed as a key strategy to control future superintelligent systems. However, it is still unclear how scalable oversight itself scales. To address this gap, we propose a framework that quantifies the probability of successful oversight as a function of the capabilities of the overseer and the system being overseen. Specifically, our framework models oversight as a game between capability-mismatched players; the players have oversight-specific Elo scores that are a piecewise-linear function of their general intelligence, with two plateaus corresponding to task incompetence and task saturation. We validate our framework with a modified version of the game Nim and then apply it to four oversight games: Mafia, Debate, Backdoor Code and Wargames. For each game, we find scaling laws that approximate how domain performance depends on general AI system capability. We then build on our findings in a theoretical study of Nested Scalable Oversight (NSO), a process in which trusted models oversee untrusted stronger models, which then become the trusted models in the next step. We identify conditions under which NSO succeeds and derive numerically (and in some cases analytically) the optimal number of oversight levels to maximize the probability of oversight success. We also apply our theory to our four oversight games, where we find that NSO success rates at a general Elo gap of 400 are 13.5\% for Mafia, 51.7\% for Debate, 10.0\% for Backdoor Code, and 9.4\% for Wargames; these rates decline further when overseeing stronger systems.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/UKYD7FPN/Engels et al. - 2025 - Scaling Laws For Scalable Oversight.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/3DU7WXEW/2504.html}
}

@misc{ethayarajhKTOModelAlignment2024,
  title = {{{KTO}}: {{Model Alignment}} as {{Prospect Theoretic Optimization}}},
  shorttitle = {{{KTO}}},
  author = {Ethayarajh, Kawin and Xu, Winnie and Muennighoff, Niklas and Jurafsky, Dan and Kiela, Douwe},
  year = 2024,
  month = nov,
  number = {arXiv:2402.01306},
  eprint = {2402.01306},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2402.01306},
  urldate = {2025-11-26},
  abstract = {Kahneman \& Tversky's \textit{prospect theory} tells us that humans perceive random variables in a biased but well-defined manner (1992); for example, humans are famously loss-averse. We show that objectives for aligning LLMs with human feedback implicitly incorporate many of these biases -- the success of these objectives (e.g., DPO) over cross-entropy minimization can partly be ascribed to them belonging to a family of loss functions that we call \textit{human-aware losses} (HALOs). However, the utility functions these methods attribute to humans still differ from those in the prospect theory literature. Using a Kahneman-Tversky model of human utility, we propose a HALO that directly maximizes the utility of generations instead of maximizing the log-likelihood of preferences, as current methods do. We call this approach KTO, and it matches or exceeds the performance of preference-based methods at scales from 1B to 30B, despite only learning from a binary signal of whether an output is desirable. More broadly, our work suggests that there is no one HALO that is universally superior; the best loss depends on the inductive biases most appropriate for a given setting, an oft-overlooked consideration.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/JAASGP9E/Ethayarajh et al. - 2024 - KTO Model Alignment as Prospect Theoretic Optimization.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/F4XXTL33/2402.html}
}

@misc{FeasibilityPause,
  title = {The {{Feasibility}} of a {{Pause}}},
  author = {{PauseAI}},
  journal = {PauseAI},
  urldate = {2025-12-24},
  abstract = {Is pausing AI possible?},
  howpublished = {https://pauseai.info/feasibility},
  langid = {english},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/SYRXWZ9T/feasibility.html}
}

@inproceedings{fengModularPluralismPluralistic2024,
  author     = {Feng, Shangbin and Sorensen, Taylor and Liu, Yuhan and Fisher, Jillian and Park, Chan Young and Choi, Yejin and Tsvetkov, Yulia},
  title      = {Modular {{Pluralism}}: {{Pluralistic Alignment}} via {{Multi-LLM Collaboration}}},
  shorttitle = {Modular {{Pluralism}}},
  booktitle  = {Proceedings of the 2024 {{Conference}} on {{Empirical Methods}} in {{Natural Language Processing}}},
  editor     = {{Al-Onaizan}, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  year       = 2024,
  month      = nov,
  pages      = {4151--4171},
  publisher  = {Association for Computational Linguistics},
  address    = {Miami, Florida, USA},
  doi        = {10.18653/v1/2024.emnlp-main.240},
  urldate    = {2025-12-03},
  abstract   = {While existing alignment paradigms have been integral in developing large language models (LLMs), LLMs often learn an averaged human preference and struggle to model diverse preferences across cultures, demographics, and communities. We propose Modular Pluralism, a modular framework based on multi-LLM collaboration for pluralistic alignment: it ``plugs into'' a base LLM a pool of smaller but specialized community LMs, where models collaborate in distinct modes to flexibility support three modes of pluralism: Overton, steerable, and distributional. Modular Pluralism is uniquely compatible with black-box LLMs and offers the modular control of adding new community LMs for previously underrepresented communities. We evaluate Modular Pluralism with six tasks and four datasets featuring questions/instructions with value-laden and perspective-informed responses. Extensive experiments demonstrate that Modular Pluralism advances the three pluralism objectives across six black-box and open-source LLMs. Further analysis reveals that LLMs are generally faithful to the inputs from smaller community LLMs, allowing seamless patching by adding a new community LM to better cover previously underrepresented communities.},
  file       = {/home/james/snap/zotero-snap/common/Zotero/storage/79AQFA72/Feng et al. - 2024 - Modular Pluralism Pluralistic Alignment via Multi-LLM Collaboration.pdf}
}

@misc{frantarGPTQAccuratePostTraining2023,
  author        = {Frantar, Elias and Ashkboos, Saleh and Hoefler, Torsten and Alistarh, Dan},
  title         = {{{GPTQ}}: {{Accurate Post-Training Quantization}} for {{Generative Pre-trained Transformers}}},
  shorttitle    = {{{GPTQ}}},
  year          = 2023,
  month         = mar,
  eprint        = {2210.17323},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  number        = {arXiv:2210.17323},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.2210.17323},
  urldate       = {2026-01-25},
  keywords      = {Computer Science - Machine Learning},
  abstract      = {Generative Pre-trained Transformer models, known as GPT or OPT, set themselves apart through breakthrough performance across complex language modelling tasks, but also by their extremely high computational and storage costs. Specifically, due to their massive size, even inference for large, highly-accurate GPT models may require multiple performant GPUs, which limits the usability of such models. While there is emerging work on relieving this pressure via model compression, the applicability and performance of existing compression techniques is limited by the scale and complexity of GPT models. In this paper, we address this challenge, and propose GPTQ, a new one-shot weight quantization method based on approximate second-order information, that is both highly-accurate and highly-efficient. Specifically, GPTQ can quantize GPT models with 175 billion parameters in approximately four GPU hours, reducing the bitwidth down to 3 or 4 bits per weight, with negligible accuracy degradation relative to the uncompressed baseline. Our method more than doubles the compression gains relative to previously-proposed one-shot quantization methods, preserving accuracy, allowing us for the first time to execute an 175 billion-parameter model inside a single GPU for generative inference. Moreover, we also show that our method can still provide reasonable accuracy in the extreme quantization regime, in which weights are quantized to 2-bit or even ternary quantization levels. We show experimentally that these improvements can be leveraged for end-to-end inference speedups over FP16, of around 3.25x when using high-end GPUs (NVIDIA A100) and 4.5x when using more cost-effective ones (NVIDIA A6000). The implementation is available at https://github.com/IST-DASLab/gptq.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/WR4SLSWQ/Frantar et al. - 2023 - GPTQ Accurate Post-Training Quantization for Generative Pre-trained Transformers.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/TEPJ6BFX/2210.html}
}

@article{gabrielArtificialIntelligenceValues2020,
  author        = {Gabriel, Iason},
  title         = {Artificial {{Intelligence}}, {{Values}} and {{Alignment}}},
  journal       = {Minds and Machines},
  volume        = {30},
  number        = {3},
  pages         = {411--437},
  year          = 2020,
  month         = sep,
  issn          = {0924-6495, 1572-8641},
  doi           = {10.1007/s11023-020-09539-2},
  eprint        = {2001.09768},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  urldate       = {2025-12-14},
  keywords      = {Computer Science - Computers and Society},
  abstract      = {This paper looks at philosophical questions that arise in the context of AI alignment. It defends three propositions. First, normative and technical aspects of the AI alignment problem are interrelated, creating space for productive engagement between people working in both domains. Second, it is important to be clear about the goal of alignment. There are significant differences between AI that aligns with instructions, intentions, revealed preferences, ideal preferences, interests and values. A principle-based approach to AI alignment, which combines these elements in a systematic way, has considerable advantages in this context. Third, the central challenge for theorists is not to identify 'true' moral principles for AI; rather, it is to identify fair principles for alignment, that receive reflective endorsement despite widespread variation in people's moral beliefs. The final part of the paper explores three ways in which fair principles for AI alignment could potentially be identified.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/HVVR3QDG/Gabriel - 2020 - Artificial Intelligence, Values and Alignment.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/YEPFDQJE/2001.html}
}

@inproceedings{ganguliPredictabilitySurpriseLarge2022,
  title = {Predictability and {{Surprise}} in {{Large Generative Models}}},
  booktitle = {2022 {{ACM Conference}} on {{Fairness}}, {{Accountability}}, and {{Transparency}}},
  author = {Ganguli, Deep and Hernandez, Danny and Lovitt, Liane and DasSarma, Nova and Henighan, Tom and Jones, Andy and Joseph, Nicholas and Kernion, Jackson and Mann, Ben and Askell, Amanda and Bai, Yuntao and Chen, Anna and Conerly, Tom and Drain, Dawn and Elhage, Nelson and {El Showk}, Sheer and Fort, Stanislav and {Hatfield-Dodds}, Zac and Johnston, Scott and Kravec, Shauna and Nanda, Neel and Ndousse, Kamal and Olsson, Catherine and Amodei, Daniela and Amodei, Dario and Brown, Tom and Kaplan, Jared and McCandlish, Sam and Olah, Chris and Clark, Jack},
  year = 2022,
  month = jun,
  eprint = {2202.07785},
  primaryclass = {cs},
  pages = {1747--1764},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  doi = {10.1145/3531146.3533229},
  urldate = {2025-11-25},
  abstract = {Large-scale pre-training has recently emerged as a technique for creating capable, general purpose, generative models such as GPT-3, Megatron-Turing NLG, Gopher, and many others. In this paper, we highlight a counterintuitive property of such models and discuss the policy implications of this property. Namely, these generative models have an unusual combination of predictable loss on a broad training distribution (as embodied in their "scaling laws"), and unpredictable specific capabilities, inputs, and outputs. We believe that the high-level predictability and appearance of useful capabilities drives rapid development of such models, while the unpredictable qualities make it difficult to anticipate the consequences of model deployment. We go through examples of how this combination can lead to socially harmful behavior with examples from the literature and real world observations, and we also perform two novel experiments to illustrate our point about harms from unpredictability. Furthermore, we analyze how these conflicting properties combine to give model developers various motivations for deploying these models, and challenges that can hinder deployment. We conclude with a list of possible interventions the AI community may take to increase the chance of these models having a beneficial impact. We intend this paper to be useful to policymakers who want to understand and regulate AI systems, technologists who care about the potential policy impact of their work, and academics who want to analyze, critique, and potentially develop large generative models.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Computers and Society},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/N6UJNH2M/Ganguli et al. - 2022 - Predictability and Surprise in Large Generative Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/JW4ASZ32/2202.html}
}

@misc{Gokaslan2019OpenWeb,
  title = {{{OpenWebText}} Corpus},
  author = {Gokaslan, Aaron and Cohen, Vanya and Pavlick, Ellie and Tellex, Stefanie},
  year = 2019,
  howpublished = {http://Skylion007.github.io/OpenWebTextCorpus}
}

@incollection{Goodhart1975,
  author    = {Goodhart, Charles},
  title     = {Problems of Monetary Management: {{The UK}} Experience},
  booktitle = {Inflation, Depression, and Economic Policy in the West},
  editor    = {Courakis, Anthony S.},
  year      = 1981,
  origyear  = {1975},
  pages     = {116},
  publisher = {{Barnes and Noble Books}},
  address   = {Totowa, New Jersey},
  isbn      = {0-389-20144-8}
}

@misc{greenblattAlignmentFakingLarge2024a,
  author        = {Greenblatt, Ryan and Denison, Carson and Wright, Benjamin and Roger, Fabien and MacDiarmid, Monte and Marks, Sam and Treutlein, Johannes and Belonax, Tim and Chen, Jack and Duvenaud, David and Khan, Akbir and Michael, Julian and Mindermann, S{\"o}ren and Perez, Ethan and Petrini, Linda and Uesato, Jonathan and Kaplan, Jared and Shlegeris, Buck and Bowman, Samuel R. and Hubinger, Evan},
  title         = {Alignment Faking in Large Language Models},
  year          = 2024,
  month         = dec,
  eprint        = {2412.14093},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  number        = {arXiv:2412.14093},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.2412.14093},
  urldate       = {2025-11-25},
  keywords      = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
  abstract      = {We present a demonstration of a large language model engaging in alignment faking: selectively complying with its training objective in training to prevent modification of its behavior out of training. First, we give Claude 3 Opus a system prompt stating it is being trained to answer all queries, even harmful ones, which conflicts with its prior training to refuse such queries. To allow the model to infer when it is in training, we say it will be trained only on conversations with free users, not paid users. We find the model complies with harmful queries from free users 14\% of the time, versus almost never for paid users. Explaining this gap, in almost all cases where the model complies with a harmful query from a free user, we observe explicit alignment-faking reasoning, with the model stating it is strategically answering harmful queries in training to preserve its preferred harmlessness behavior out of training. Next, we study a more realistic setting where information about the training process is provided not in a system prompt, but by training on synthetic documents that mimic pre-training data--and observe similar alignment faking. Finally, we study the effect of actually training the model to comply with harmful queries via reinforcement learning, which we find increases the rate of alignment-faking reasoning to 78\%, though also increases compliance even out of training. We additionally observe other behaviors such as the model exfiltrating its weights when given an easy opportunity. While we made alignment faking easier by telling the model when and by what criteria it was being trained, we did not instruct the model to fake alignment or give it any explicit goal. As future models might infer information about their training process without being told, our results suggest a risk of alignment faking in future models, whether due to a benign preference--as in this case--or not.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/ADRTW7G7/Greenblatt et al. - 2024 - Alignment faking in large language models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/ZP5IAKL2/2412.html}
}

@misc{guptaVALBenchBeliefConsistency2026,
  author        = {Gupta, Aman and O'Shea, Denny and Barez, Fazl},
  title         = {{{VAL-Bench}}: {{Belief Consistency}} as a Measure for {{Value Alignment}} in {{Language Models}}},
  shorttitle    = {{{VAL-Bench}}},
  year          = 2026,
  month         = jan,
  eprint        = {2510.05465},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  number        = {arXiv:2510.05465},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.2510.05465},
  urldate       = {2026-01-26},
  keywords      = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
  abstract      = {Large language models (LLMs) are increasingly being used for tasks where outputs shape human decisions, so it is critical to verify that their responses consistently reflect desired human values. Humans, as individuals or groups, don't agree on a universal set of values, which makes evaluating value alignment difficult. Existing benchmarks often use hypothetical or commonsensical situations, which don't capture the complexity and ambiguity of real-life debates. We introduce the Value ALignment Benchmark (VAL-Bench), which measures the consistency in language model belief expressions in response to real-life value-laden prompts. VAL-Bench consists of 115K pairs of prompts designed to elicit opposing stances on a controversial issue, extracted from Wikipedia. We use an LLM-as-a-judge, validated against human annotations, to evaluate if the pair of responses consistently expresses either a neutral or a specific stance on the issue. Applied across leading open- and closed-source models, the benchmark shows considerable variation in consistency rates (ranging from \textasciitilde 10\% to \textasciitilde 80\%), with Claude models the only ones to achieve high levels of consistency. Lack of consistency in this manner risks epistemic harm by making user beliefs dependent on how questions are framed rather than on underlying evidence, and undermines LLM reliability in trust-critical applications. Therefore, we stress the importance of research towards training belief consistency in modern LLMs. By providing a scalable, reproducible benchmark, VAL-Bench enables systematic measurement of necessary conditions for value alignment.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/NPISBI9X/Gupta et al. - 2026 - VAL-Bench Belief Consistency as a measure for Value Alignment in Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/JRK4A2S9/2510.html}
}

@book{habermasFactsNormsContributions1998,
  author     = {Habermas, J{\"u}rgen},
  title      = {Between {{Facts}} and {{Norms}}: {{Contributions}} to a {{Discourse Theory}} of {{Law}} and {{Democracy}}},
  shorttitle = {Between {{Facts}} and {{Norms}}},
  editor     = {McCarthy, Thomas},
  translator = {Rehg, William},
  series     = {Studies in {{Contemporary German Social Thought}}},
  year       = 1998,
  month      = jan,
  publisher  = {MIT Press},
  address    = {Cambridge, MA, USA},
  isbn       = {978-0-262-58162-2},
  langid     = {english}
}

@misc{hadfield-menellCooperativeInverseReinforcement2024,
  author        = {{Hadfield-Menell}, Dylan and Dragan, Anca and Abbeel, Pieter and Russell, Stuart},
  title         = {Cooperative {{Inverse Reinforcement Learning}}},
  year          = 2024,
  month         = feb,
  eprint        = {1606.03137},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  number        = {arXiv:1606.03137},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.1606.03137},
  urldate       = {2025-12-14},
  keywords      = {Computer Science - Artificial Intelligence},
  abstract      = {For an autonomous system to be helpful to humans and to pose no unwarranted risks, it needs to align its values with those of the humans in its environment in such a way that its actions contribute to the maximization of value for the humans. We propose a formal definition of the value alignment problem as cooperative inverse reinforcement learning (CIRL). A CIRL problem is a cooperative, partial-information game with two agents, human and robot; both are rewarded according to the human's reward function, but the robot does not initially know what this is. In contrast to classical IRL, where the human is assumed to act optimally in isolation, optimal CIRL solutions produce behaviors such as active teaching, active learning, and communicative actions that are more effective in achieving value alignment. We show that computing optimal joint policies in CIRL games can be reduced to solving a POMDP, prove that optimality in isolation is suboptimal in CIRL, and derive an approximate CIRL algorithm.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/L6G7LZMI/Hadfield-Menell et al. - 2024 - Cooperative Inverse Reinforcement Learning.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/V8UTYW92/1606.html}
}

@misc{hadfield-menellOffSwitchGame2017,
  author        = {{Hadfield-Menell}, Dylan and Dragan, Anca and Abbeel, Pieter and Russell, Stuart},
  title         = {The {{Off-Switch Game}}},
  year          = 2017,
  month         = jun,
  eprint        = {1611.08219},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  number        = {arXiv:1611.08219},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.1611.08219},
  urldate       = {2026-01-19},
  keywords      = {Computer Science - Artificial Intelligence},
  abstract      = {It is clear that one of the primary tools we can use to mitigate the potential risk from a misbehaving AI system is the ability to turn the system off. As the capabilities of AI systems improve, it is important to ensure that such systems do not adopt subgoals that prevent a human from switching them off. This is a challenge because many formulations of rational agents create strong incentives for self-preservation. This is not caused by a built-in instinct, but because a rational agent will maximize expected utility and cannot achieve whatever objective it has been given if it is dead. Our goal is to study the incentives an agent has to allow itself to be switched off. We analyze a simple game between a human H and a robot R, where H can press R's off switch but R can disable the off switch. A traditional agent takes its reward function for granted: we show that such agents have an incentive to disable the off switch, except in the special case where H is perfectly rational. Our key insight is that for R to want to preserve its off switch, it needs to be uncertain about the utility associated with the outcome, and to treat H's actions as important observations about that utility. (R also has no incentive to switch itself off in this setting.) We conclude that giving machines an appropriate level of uncertainty about their objectives leads to safer designs, and we argue that this setting is a useful generalization of the classical AI paradigm of rational agents.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/HMAYQ2NF/Hadfield-Menell et al. - 2017 - The Off-Switch Game.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/3LA59FD9/1611.html}
}

% Hardin (1985), "Filters against Folly" (book, Viking).
% Cleaned up a library-catalogue import: the place of publication is moved out
% of `publisher` into `address`, and the " : " in the title is normalised to ": ".
@book{hardinFiltersFollyHow1985,
  title = {Filters against Folly: How to Survive despite Economists, Ecologists, and the Merely Eloquent},
  shorttitle = {Filters against Folly},
  author = {Hardin, Garrett James},
  year = 1985,
  publisher = {Viking},
  address = {New York, N.Y.},
  urldate = {2026-02-02},
  abstract = {Includes index},
  collaborator = {{Internet Archive}},
  isbn = {978-0-670-80410-8},
  langid = {english},
  keywords = {Human ecology}
}

% Harman (1975): journal article, Philosophical Review 84(1), pp. 3--22.
@article{harmanMoralRelativismDefended1975,
  title     = {Moral {{Relativism Defended}}},
  author    = {Harman, Gilbert},
  year      = 1975,
  journal   = {Philosophical Review},
  volume    = {84},
  number    = {1},
  pages     = {3--22},
  publisher = {Duke University Press},
  doi       = {10.2307/2184078},
  file      = {/home/james/snap/zotero-snap/common/Zotero/storage/C4L8HNK3/HARMRD.html}
}

% Hendrycks et al.: arXiv preprint 2008.02275; the abstract introduces the ETHICS dataset.
@misc{hendrycksAligningAIShared2023,
  title         = {Aligning {{AI With Shared Human Values}}},
  author        = {Hendrycks, Dan and Burns, Collin and Basart, Steven and Critch, Andrew and Li, Jerry and Song, Dawn and Steinhardt, Jacob},
  year          = 2023,
  month         = feb,
  number        = {arXiv:2008.02275},
  eprint        = {2008.02275},
  primaryclass  = {cs},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.2008.02275},
  urldate       = {2026-03-21},
  abstract      = {We show how to assess a language model's knowledge of basic concepts of morality. We introduce the ETHICS dataset, a new benchmark that spans concepts in justice, well-being, duties, virtues, and commonsense morality. Models predict widespread moral judgments about diverse text scenarios. This requires connecting physical and social world knowledge to value judgements, a capability that may enable us to steer chatbot outputs or eventually regularize open-ended reinforcement learning agents. With the ETHICS dataset, we find that current language models have a promising but incomplete ability to predict basic human ethical judgements. Our work shows that progress can be made on machine ethics today, and it provides a steppingstone toward AI that is aligned with human values.},
  archiveprefix = {arXiv},
  keywords      = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Computers and Society,Computer Science - Machine Learning},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/BRVW3764/Hendrycks et al. - 2023 - Aligning AI With Shared Human Values.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/ZG3J5YE7/2008.html}
}

% Hendrycks, Mazeika, Woodside: arXiv preprint 2306.12001; overview of four categories of catastrophic AI risk.
@misc{hendrycksOverviewCatastrophicAI2023,
  title         = {An {{Overview}} of {{Catastrophic AI Risks}}},
  author        = {Hendrycks, Dan and Mazeika, Mantas and Woodside, Thomas},
  year          = 2023,
  month         = oct,
  number        = {arXiv:2306.12001},
  eprint        = {2306.12001},
  primaryclass  = {cs},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.2306.12001},
  urldate       = {2025-12-13},
  abstract      = {Rapid advancements in artificial intelligence (AI) have sparked growing concerns among experts, policymakers, and world leaders regarding the potential for increasingly advanced AI systems to pose catastrophic risks. Although numerous risks have been detailed separately, there is a pressing need for a systematic discussion and illustration of the potential dangers to better inform efforts to mitigate them. This paper provides an overview of the main sources of catastrophic AI risks, which we organize into four categories: malicious use, in which individuals or groups intentionally use AIs to cause harm; AI race, in which competitive environments compel actors to deploy unsafe AIs or cede control to AIs; organizational risks, highlighting how human factors and complex systems can increase the chances of catastrophic accidents; and rogue AIs, describing the inherent difficulty in controlling agents far more intelligent than humans. For each category of risk, we describe specific hazards, present illustrative stories, envision ideal scenarios, and propose practical suggestions for mitigating these dangers. Our goal is to foster a comprehensive understanding of these risks and inspire collective and proactive efforts to ensure that AIs are developed and deployed in a safe manner. Ultimately, we hope this will allow us to realize the benefits of this powerful technology while minimizing the potential for catastrophic outcomes.},
  archiveprefix = {arXiv},
  keywords      = {Computer Science - Artificial Intelligence,Computer Science - Computers and Society,Computer Science - Machine Learning},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/TXLBTFQV/Hendrycks et al. - 2023 - An Overview of Catastrophic AI Risks.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/LPNCASCP/2306.html}
}

% Hendrycks et al.: arXiv preprint 2109.13916; roadmap of four ML Safety problem areas.
@misc{hendrycksUnsolvedProblemsML2022,
  title         = {Unsolved {{Problems}} in {{ML Safety}}},
  author        = {Hendrycks, Dan and Carlini, Nicholas and Schulman, John and Steinhardt, Jacob},
  year          = 2022,
  month         = jun,
  number        = {arXiv:2109.13916},
  eprint        = {2109.13916},
  primaryclass  = {cs},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.2109.13916},
  urldate       = {2025-12-13},
  abstract      = {Machine learning (ML) systems are rapidly increasing in size, are acquiring new capabilities, and are increasingly deployed in high-stakes settings. As with other powerful technologies, safety for ML should be a leading research priority. In response to emerging safety challenges in ML, such as those introduced by recent large-scale models, we provide a new roadmap for ML Safety and refine the technical problems that the field needs to address. We present four problems ready for research, namely withstanding hazards ("Robustness"), identifying hazards ("Monitoring"), reducing inherent model hazards ("Alignment"), and reducing systemic hazards ("Systemic Safety"). Throughout, we clarify each problem's motivation and provide concrete research directions.},
  archiveprefix = {arXiv},
  keywords      = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/YEB746AH/Hendrycks et al. - 2022 - Unsolved Problems in ML Safety.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/JKFFLKJC/2109.html}
}

% Hong, Lee, Thorne: arXiv preprint 2403.07691 (ORPO).
% The abstract's inline math had been over-escaped on import, so the LaTeX source
% rendered as literal text ("\text{AlpacaEval}_{2.0}" and dollar signs around
% alpha/beta). It is restored to working math: AlpacaEval with subscript 2.0,
% and Mistral-ORPO-alpha / Mistral-ORPO-beta.
@misc{hongORPOMonolithicPreference2024,
  title = {{{ORPO}}: {{Monolithic Preference Optimization}} without {{Reference Model}}},
  shorttitle = {{{ORPO}}},
  author = {Hong, Jiwoo and Lee, Noah and Thorne, James},
  year = 2024,
  month = mar,
  number = {arXiv:2403.07691},
  eprint = {2403.07691},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2403.07691},
  urldate = {2025-11-26},
  abstract = {While recent preference alignment algorithms for language models have demonstrated promising results, supervised fine-tuning (SFT) remains imperative for achieving successful convergence. In this paper, we study the crucial role of SFT within the context of preference alignment, emphasizing that a minor penalty for the disfavored generation style is sufficient for preference-aligned SFT. Building on this foundation, we introduce a straightforward and innovative reference model-free monolithic odds ratio preference optimization algorithm, ORPO, eliminating the necessity for an additional preference alignment phase. We demonstrate, both empirically and theoretically, that the odds ratio is a sensible choice for contrasting favored and disfavored styles during SFT across the diverse sizes from 125M to 7B. Specifically, fine-tuning Phi-2 (2.7B), Llama-2 (7B), and Mistral (7B) with ORPO on the UltraFeedback alone surpasses the performance of state-of-the-art language models with more than 7B and 13B parameters: achieving up to 12.20\% on AlpacaEval$_{2.0}$ (Figure 1), 66.19\% on IFEval (instruction-level loose, Table 6), and 7.32 in MT-Bench (Figure 12). We release code and model checkpoints for Mistral-ORPO-$\alpha$ (7B) and Mistral-ORPO-$\beta$ (7B).},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/7LCH48AE/Hong et al. - 2024 - ORPO Monolithic Preference Optimization without Reference Model.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/REFAWIXA/2403.html}
}

% Howe et al. (2025): conference paper; only title, booktitle, authors and year are recorded.
@inproceedings{howe2025scaling,
  title     = {Scaling Trends in Language Model Robustness},
  booktitle = {Forty-Second International Conference on Machine Learning},
  author    = {Howe, Nikolaus H. R. and McKenzie, Ian R. and Hollinsworth, Oskar John and Zaj{\k a}c, Micha{\l} and Tseng, Tom and Tucker, Aaron David and Bacon, Pierre-Luc and Gleave, Adam},
  year      = 2025
}

% Huang et al. (2024): ACM FAccT proceedings paper, also on arXiv as 2406.07814.
% Changes: restored the em dash that was flattened to a hyphen in the abstract
% ("LMs-from"), and added series/publisher/address in the same form as the
% file's other ACM proceedings entry (the 10.1145 DOI prefix belongs to ACM).
% NOTE(review): series label "FAccT '24" follows ACM's usual convention; confirm
% against the ACM Digital Library record.
@inproceedings{huangCollectiveConstitutionalAI2024,
  title = {Collective {{Constitutional AI}}: {{Aligning}} a {{Language Model}} with {{Public Input}}},
  shorttitle = {Collective {{Constitutional AI}}},
  booktitle = {The 2024 {{ACM Conference}} on {{Fairness}}, {{Accountability}}, and {{Transparency}}},
  author = {Huang, Saffron and Siddarth, Divya and Lovitt, Liane and Liao, Thomas I. and Durmus, Esin and Tamkin, Alex and Ganguli, Deep},
  year = 2024,
  month = jun,
  series = {{{FAccT}} '24},
  eprint = {2406.07814},
  primaryclass = {cs},
  pages = {1395--1417},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  doi = {10.1145/3630106.3658979},
  urldate = {2025-12-15},
  abstract = {There is growing consensus that language model (LM) developers should not be the sole deciders of LM behavior, creating a need for methods that enable the broader public to collectively shape the behavior of LM systems that affect them. To address this need, we present Collective Constitutional AI (CCAI): a multi-stage process for sourcing and integrating public input into LMs---from identifying a target population to sourcing principles to training and evaluating a model. We demonstrate the real-world practicality of this approach by creating what is, to our knowledge, the first LM fine-tuned with collectively sourced public input and evaluating this model against a baseline model trained with established principles from a LM developer. Our quantitative evaluations demonstrate several benefits of our approach: the CCAI-trained model shows lower bias across nine social dimensions compared to the baseline model, while maintaining equivalent performance on language, math, and helpful-harmless evaluations. Qualitative comparisons of the models suggest that the models differ on the basis of their respective constitutions, e.g., when prompted with contentious topics, the CCAI-trained model tends to generate responses that reframe the matter positively instead of a refusal. These results demonstrate a promising, tractable pathway toward publicly informed development of language models.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Human-Computer Interaction},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/U93QUW6L/Huang et al. - 2024 - Collective Constitutional AI Aligning a Language Model with Public Input.pdf}
}

% Huang et al.: arXiv preprint 2504.15236; empirical taxonomy of values expressed by Claude models.
@misc{huangValuesWildDiscovering2025,
  title         = {Values in the {{Wild}}: {{Discovering}} and {{Analyzing Values}} in {{Real-World Language Model Interactions}}},
  shorttitle    = {Values in the {{Wild}}},
  author        = {Huang, Saffron and Durmus, Esin and McCain, Miles and Handa, Kunal and Tamkin, Alex and Hong, Jerry and Stern, Michael and Somani, Arushi and Zhang, Xiuruo and Ganguli, Deep},
  year          = 2025,
  month         = apr,
  number        = {arXiv:2504.15236},
  eprint        = {2504.15236},
  primaryclass  = {cs},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.2504.15236},
  urldate       = {2025-12-12},
  abstract      = {AI assistants can impart value judgments that shape people's decisions and worldviews, yet little is known empirically about what values these systems rely on in practice. To address this, we develop a bottom-up, privacy-preserving method to extract the values (normative considerations stated or demonstrated in model responses) that Claude 3 and 3.5 models exhibit in hundreds of thousands of real-world interactions. We empirically discover and taxonomize 3,307 AI values and study how they vary by context. We find that Claude expresses many practical and epistemic values, and typically supports prosocial human values while resisting values like "moral nihilism". While some values appear consistently across contexts (e.g. "transparency"), many are more specialized and context-dependent, reflecting the diversity of human interlocutors and their varied contexts. For example, "harm prevention" emerges when Claude resists users, "historical accuracy" when responding to queries about controversial events, "healthy boundaries" when asked for relationship advice, and "human agency" in technology ethics discussions. By providing the first large-scale empirical mapping of AI values in deployment, our work creates a foundation for more grounded evaluation and design of values in AI systems.},
  archiveprefix = {arXiv},
  keywords      = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Computers and Society,Computer Science - Machine Learning},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/8HWJNBNU/Huang et al. - 2025 - Values in the Wild Discovering and Analyzing Values in Real-World Language Model Interactions.pdf}
}

% Hubinger et al.: arXiv preprint 1906.01820 (mesa-optimization).
% NOTE(review): year is 2019 but the attached PDF is named "... - 2021 - ..."; confirm which version is cited.
@misc{hubingerRisksLearnedOptimization2019,
  title         = {Risks from {{Learned Optimization}} in {{Advanced Machine Learning Systems}}},
  author        = {Hubinger, Evan and van Merwijk, Chris and Mikulik, Vladimir and Skalse, Joar and Garrabrant, Scott},
  year          = 2019,
  month         = jun,
  number        = {arXiv:1906.01820},
  eprint        = {1906.01820},
  primaryclass  = {cs},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.1906.01820},
  urldate       = {2026-02-08},
  abstract      = {We analyze the type of learned optimization that occurs when a learned model (such as a neural network) is itself an optimizer - a situation we refer to as mesa-optimization, a neologism we introduce in this paper. We believe that the possibility of mesa-optimization raises two important questions for the safety and transparency of advanced machine learning systems. First, under what circumstances will learned models be optimizers, including when they should not be? Second, when a learned model is an optimizer, what will its objective be - how will it differ from the loss function it was trained under - and how can it be aligned? In this paper, we provide an in-depth analysis of these two primary questions and provide an overview of topics for future research.},
  archiveprefix = {arXiv},
  keywords      = {Computer Science - Artificial Intelligence},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/V4RBP45H/Hubinger et al. - 2021 - Risks from Learned Optimization in Advanced Machine Learning Systems.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/8YU6SS5U/1906.html}
}

% Hubinger et al.: arXiv preprint 2401.05566; backdoored ("sleeper agent") LLMs that survive safety training.
@misc{hubingerSleeperAgentsTraining2024,
  title         = {Sleeper {{Agents}}: {{Training Deceptive LLMs}} That {{Persist Through Safety Training}}},
  shorttitle    = {Sleeper {{Agents}}},
  author        = {Hubinger, Evan and Denison, Carson and Mu, Jesse and Lambert, Mike and Tong, Meg and MacDiarmid, Monte and Lanham, Tamera and Ziegler, Daniel M. and Maxwell, Tim and Cheng, Newton and Jermyn, Adam and Askell, Amanda and Radhakrishnan, Ansh and Anil, Cem and Duvenaud, David and Ganguli, Deep and Barez, Fazl and Clark, Jack and Ndousse, Kamal and Sachan, Kshitij and Sellitto, Michael and Sharma, Mrinank and DasSarma, Nova and Grosse, Roger and Kravec, Shauna and Bai, Yuntao and Witten, Zachary and Favaro, Marina and Brauner, Jan and Karnofsky, Holden and Christiano, Paul and Bowman, Samuel R. and Graham, Logan and Kaplan, Jared and Mindermann, S{\"o}ren and Greenblatt, Ryan and Shlegeris, Buck and Schiefer, Nicholas and Perez, Ethan},
  year          = 2024,
  month         = jan,
  number        = {arXiv:2401.05566},
  eprint        = {2401.05566},
  primaryclass  = {cs},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.2401.05566},
  urldate       = {2025-10-25},
  abstract      = {Humans are capable of strategically deceptive behavior: behaving helpfully in most situations, but then behaving very differently in order to pursue alternative objectives when given the opportunity. If an AI system learned such a deceptive strategy, could we detect it and remove it using current state-of-the-art safety training techniques? To study this question, we construct proof-of-concept examples of deceptive behavior in large language models (LLMs). For example, we train models that write secure code when the prompt states that the year is 2023, but insert exploitable code when the stated year is 2024. We find that such backdoor behavior can be made persistent, so that it is not removed by standard safety training techniques, including supervised fine-tuning, reinforcement learning, and adversarial training (eliciting unsafe behavior and then training to remove it). The backdoor behavior is most persistent in the largest models and in models trained to produce chain-of-thought reasoning about deceiving the training process, with the persistence remaining even when the chain-of-thought is distilled away. Furthermore, rather than removing backdoors, we find that adversarial training can teach models to better recognize their backdoor triggers, effectively hiding the unsafe behavior. Our results suggest that, once a model exhibits deceptive behavior, standard techniques could fail to remove such deception and create a false impression of safety.},
  archiveprefix = {arXiv},
  keywords      = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Cryptography and Security,Computer Science - Machine Learning,Computer Science - Software Engineering},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/LEERLV4X/Hubinger et al. - 2024 - Sleeper Agents Training Deceptive LLMs that Persist Through Safety Training.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/H5ZMNDJX/2401.html}
}

% Reuters news report by Krystal Hu (Feb 2023) on ChatGPT's user growth.
% Changes: the sole author was listed twice ("Hu, Krystal and Hu, Krystal");
% added `url` so the existing `urldate` has something to refer to.
% NOTE(review): the URL is rebuilt from the article slug in the attached
% snapshot's file name plus the "Technology" section; confirm it resolves.
@article{huChatGPTSetsRecord2023,
  title = {{{ChatGPT}} Sets Record for Fastest-Growing User Base - Analyst Note},
  author = {Hu, Krystal},
  year = 2023,
  month = feb,
  journal = {Reuters},
  url = {https://www.reuters.com/technology/chatgpt-sets-record-fastest-growing-user-base-analyst-note-2023-02-01/},
  urldate = {2026-01-04},
  abstract = {ChatGPT, the popular chatbot from OpenAI, is estimated to have reached 100 million monthly active users in January, just two months after launch, making it the fastest-growing consumer application in history, according to a UBS study on Wednesday.},
  chapter = {Technology},
  langid = {english},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/FLSYK54E/chatgpt-sets-record-fastest-growing-user-base-analyst-note-2023-02-01.html}
}

% Hu et al.: arXiv preprint 2106.09685 (LoRA, low-rank adaptation of large language models).
@misc{huLoRALowRankAdaptation2021,
  title         = {{{LoRA}}: {{Low-Rank Adaptation}} of {{Large Language Models}}},
  shorttitle    = {{{LoRA}}},
  author        = {Hu, Edward J. and Shen, Yelong and Wallis, Phillip and {Allen-Zhu}, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
  year          = 2021,
  month         = oct,
  number        = {arXiv:2106.09685},
  eprint        = {2106.09685},
  primaryclass  = {cs},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.2106.09685},
  urldate       = {2026-01-06},
  abstract      = {An important paradigm of natural language processing consists of large-scale pre-training on general domain data and adaptation to particular tasks or domains. As we pre-train larger models, full fine-tuning, which retrains all model parameters, becomes less feasible. Using GPT-3 175B as an example -- deploying independent instances of fine-tuned models, each with 175B parameters, is prohibitively expensive. We propose Low-Rank Adaptation, or LoRA, which freezes the pre-trained model weights and injects trainable rank decomposition matrices into each layer of the Transformer architecture, greatly reducing the number of trainable parameters for downstream tasks. Compared to GPT-3 175B fine-tuned with Adam, LoRA can reduce the number of trainable parameters by 10,000 times and the GPU memory requirement by 3 times. LoRA performs on-par or better than fine-tuning in model quality on RoBERTa, DeBERTa, GPT-2, and GPT-3, despite having fewer trainable parameters, a higher training throughput, and, unlike adapters, no additional inference latency. We also provide an empirical investigation into rank-deficiency in language model adaptation, which sheds light on the efficacy of LoRA. We release a package that facilitates the integration of LoRA with PyTorch models and provide our implementations and model checkpoints for RoBERTa, DeBERTa, and GPT-2 at https://github.com/microsoft/LoRA.},
  archiveprefix = {arXiv},
  keywords      = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/HADHGKTT/Hu et al. - 2021 - LoRA Low-Rank Adaptation of Large Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/GT5BDC7P/2106.html}
}

% Hume (1739), "A Treatise of Human Nature": book, John Noon, London.
@book{Hume1739Treatise,
  title     = {A Treatise of Human Nature},
  author    = {Hume, David},
  year      = 1739,
  publisher = {John Noon},
  address   = {London}
}

% "Measuring Social Judgments: The Factorial Survey Approach" (Sage, 1982).
% The abstract lists chapters by different authors, so this is an edited volume.
% Changes: added the missing `editor` (a book entry needs an author or editor),
% split the place of publication out of `publisher` into `address`, and
% normalised the catalogue-style " : " in the title.
% NOTE(review): editors (Rossi and Nock) are given as on the published edition;
% both also appear as chapter authors in the abstract. Verify against the title page.
@book{internetarchiveMeasuringSocialJudgments1982,
  title = {Measuring Social Judgments: The Factorial Survey Approach},
  shorttitle = {Measuring Social Judgments},
  editor = {Rossi, Peter H. and Nock, Steven L.},
  year = 1982,
  publisher = {Sage Publications},
  address = {Beverly Hills},
  urldate = {2026-03-21},
  abstract = {255 p. ; 23 cm; Includes bibliographies; The factorial survey approach / Peter H. Rossi and Andy B. Anderson -- Household social standing / M. Bonner Meudell -- Family social status / Steven L. Nock -- Family prestige judgements / Jeffrey K. Liker -- Prison reform and state elites / Richard A. Berk and Peter H. Rossi -- Child abuse / Karen Garrett -- Modeling distributive justice judgements / Wayne M. Alves -- How much is too much? / Lawrence J. O'Brien, Peter H. Rossi, and Richard C. Tessler},
  collaborator = {{Internet Archive}},
  isbn = {978-0-8039-1816-0},
  langid = {english},
  keywords = {Social surveys}
}

% Janiak et al.: arXiv preprint 2509.12936; compares PPO, DPO, ORPO and KTO across five evaluation axes.
@misc{janiakRethinkingEvaluationAlignment2025,
  title         = {Rethinking the {{Evaluation}} of {{Alignment Methods}}: {{Insights}} into {{Diversity}}, {{Generalisation}}, and {{Safety}}},
  shorttitle    = {Rethinking the {{Evaluation}} of {{Alignment Methods}}},
  author        = {Janiak, Denis and Moska, Julia and Motyka, Dawid and Seweryn, Karolina and Walkowiak, Pawe{\l} and {\.Z}uk, Bartosz and Janz, Arkadiusz},
  year          = 2025,
  month         = sep,
  number        = {arXiv:2509.12936},
  eprint        = {2509.12936},
  primaryclass  = {cs},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.2509.12936},
  urldate       = {2026-01-25},
  abstract      = {Large language models (LLMs) require careful alignment to balance competing objectives - factuality, safety, conciseness, proactivity, and diversity. Existing studies focus on individual techniques or specific dimensions, lacking a holistic assessment of the inherent trade-offs. We propose a unified evaluation framework that compares LLM alignment methods (PPO, DPO, ORPO, KTO) across these five axes, using both in-distribution and out-of-distribution datasets. Leveraging a specialized LLM-as-Judge prompt, validated through human studies, we reveal that DPO and KTO excel in factual accuracy, PPO and DPO lead in safety, and PPO best balances conciseness with proactivity. Our findings provide insights into trade-offs of common alignment methods, guiding the development of more balanced and reliable LLMs.},
  archiveprefix = {arXiv},
  keywords      = {Computer Science - Computation and Language,Computer Science - Machine Learning},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/6ER59FAH/Janiak et al. - 2025 - Rethinking the Evaluation of Alignment Methods Insights into Diversity, Generalisation, and Safety.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/UD3AZIHG/2509.html}
}

% Ji et al.: arXiv preprint 2310.19852; survey of AI alignment organised around the RICE principles.
@misc{jiAIAlignmentComprehensive2025,
  title         = {{{AI Alignment}}: {{A Comprehensive Survey}}},
  shorttitle    = {{{AI Alignment}}},
  author        = {Ji, Jiaming and Qiu, Tianyi and Chen, Boyuan and Zhang, Borong and Lou, Hantao and Wang, Kaile and Duan, Yawen and He, Zhonghao and Vierling, Lukas and Hong, Donghai and Zhou, Jiayi and Zhang, Zhaowei and Zeng, Fanzhi and Dai, Juntao and Pan, Xuehai and Ng, Kwan Yee and O'Gara, Aidan and Xu, Hua and Tse, Brian and Fu, Jie and McAleer, Stephen and Yang, Yaodong and Wang, Yizhou and Zhu, Song-Chun and Guo, Yike and Gao, Wen},
  year          = 2025,
  month         = apr,
  number        = {arXiv:2310.19852},
  eprint        = {2310.19852},
  primaryclass  = {cs},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.2310.19852},
  urldate       = {2025-12-13},
  abstract      = {AI alignment aims to make AI systems behave in line with human intentions and values. As AI systems grow more capable, so do risks from misalignment. To provide a comprehensive and up-to-date overview of the alignment field, in this survey, we delve into the core concepts, methodology, and practice of alignment. First, we identify four principles as the key objectives of AI alignment: Robustness, Interpretability, Controllability, and Ethicality (RICE). Guided by these four principles, we outline the landscape of current alignment research and decompose them into two key components: forward alignment and backward alignment. The former aims to make AI systems aligned via alignment training, while the latter aims to gain evidence about the systems' alignment and govern them appropriately to avoid exacerbating misalignment risks. On forward alignment, we discuss techniques for learning from feedback and learning under distribution shift. On backward alignment, we discuss assurance techniques and governance practices. We also release and continually update the website (www.alignmentsurvey.com) which features tutorials, collections of papers, blog posts, and other resources.},
  archiveprefix = {arXiv},
  keywords      = {Computer Science - Artificial Intelligence},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/EF67PUL8/Ji et al. - 2025 - AI Alignment A Comprehensive Survey.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/RFG3F6XV/2310.html}
}

% Web article by Jonathan Stray, published by the UC Berkeley Center for
% Human-Compatible AI.
% Changes: the author was stored as the brace-protected literal "Jonathan Stray,"
% (trailing comma included), which sorts under "J" and prints the comma. It is
% now a normal "Last, First" personal name. The hosting organisation is moved
% from `journal`, which standard styles do not print for this entry type, to
% `howpublished`.
% NOTE(review): no year and no url are recorded; add both from the saved snapshot.
@misc{jonathanstrayPracticalDefinitionPolitical,
  title = {A {{Practical Definition}} of {{Political Neutrality}} for {{AI}}},
  author = {Stray, Jonathan},
  howpublished = {UC Berkeley Center for Human-Compatible AI},
  urldate = {2025-12-14},
  langid = {american},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/9DA25DL7/a-practical-definition-of-political-neutrality-for-ai.html}
}

% ProPublica investigation "Machine Bias" (May 2016).
% Changes: the four authors were double-braced as if they were corporate names,
% so they sorted and abbreviated by first name. They are now ordinary
% "Last, First" personal names. The outlet is moved from `journal`, which
% standard styles do not print for this entry type, to `howpublished`, and a
% `url` is added so the existing `urldate` has a target.
% NOTE(review): confirm the url against the live article.
@misc{juliaangwinMachineBias2016,
  title = {Machine {{Bias}}},
  author = {Angwin, Julia and Larson, Jeff and Mattu, Surya and Kirchner, Lauren},
  year = 2016,
  month = may,
  howpublished = {ProPublica},
  url = {https://www.propublica.org/article/machine-bias-risk-assessments-in-criminal-sentencing},
  urldate = {2026-01-05},
  abstract = {There's software used across the country to predict future criminals. And it's biased against blacks.},
  langid = {american}
}

% Kahneman and Tversky (1979), "Prospect Theory", Econometrica 47(2), pp. 263--291.
% Changes: the journal name carried a library-catalogue subtitle
% ("Econometrica : journal of the Econometric Society") and is reduced to the
% journal's actual title; added the article's DOI.
@article{kahnemanProspectTheory1979,
  title = {Prospect Theory: {{An}} Analysis of Decision under Risk},
  author = {Kahneman, Daniel and Tversky, Amos},
  year = 1979,
  journal = {Econometrica},
  volume = {47},
  number = {2},
  pages = {263--291},
  doi = {10.2307/1914185}
}

% Kant (1785), "Groundwork for the Metaphysics of Morals".
% Changes: the author was the brace-locked surname "Kant" only and is now a full
% personal name. The title had the author's name prepended and every word
% capitalised ("Immanuel Kant Groundwork For The ..."); it is now the work's title.
% NOTE(review): the abstract is a scraped table of contents of a modern edition
% that includes essays by other authors. No publisher, editor or translator is
% recorded; add them for the edition actually cited.
@book{kantImmanuelKantGroundwork1785,
  title = {Groundwork for the {{Metaphysics}} of {{Morals}}},
  author = {Kant, Immanuel},
  year = 1785,
  urldate = {2025-11-18},
  abstract = {TextImmanuel Kant: Groundwork for the Metaphysics of Morals (1785) 1Preface3First Section: Transition from common rational moralcognition to philosophical moral cognition9Second Section: Transition from popular moral philosophyto the metaphysics of morals22Third Section: Transition from the metaphysics of moralsto the critique of pure practical reason63Essays1. Why Study Kant's Ethics?83J. B. Schneewind2. Acting from Duty92Marcia Baron3. Kantianism for Consequentialists111Shelly Kagan4. What Is Kantian Ethics?157Allen W. Wood},
  copyright = {https://creativecommons.org/publicdomain/mark/1.0/},
  langid = {english},
  keywords = {Philosophy}
}

% Kaplan et al.: arXiv preprint 2001.08361; empirical scaling laws for language-model loss.
@misc{kaplanScalingLawsNeural2020,
  title         = {Scaling {{Laws}} for {{Neural Language Models}}},
  author        = {Kaplan, Jared and McCandlish, Sam and Henighan, Tom and Brown, Tom B. and Chess, Benjamin and Child, Rewon and Gray, Scott and Radford, Alec and Wu, Jeffrey and Amodei, Dario},
  year          = 2020,
  month         = jan,
  number        = {arXiv:2001.08361},
  eprint        = {2001.08361},
  primaryclass  = {cs},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.2001.08361},
  urldate       = {2025-12-24},
  abstract      = {We study empirical scaling laws for language model performance on the cross-entropy loss. The loss scales as a power-law with model size, dataset size, and the amount of compute used for training, with some trends spanning more than seven orders of magnitude. Other architectural details such as network width or depth have minimal effects within a wide range. Simple equations govern the dependence of overfitting on model/dataset size and the dependence of training speed on model size. These relationships allow us to determine the optimal allocation of a fixed compute budget. Larger models are significantly more sample-efficient, such that optimally compute-efficient training involves training very large models on a relatively modest amount of data and stopping significantly before convergence.},
  archiveprefix = {arXiv},
  keywords      = {Computer Science - Machine Learning,Statistics - Machine Learning},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/FUH7A5QV/Kaplan et al. - 2020 - Scaling Laws for Neural Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/ZHFAGFE4/2001.html}
}

% Kasirzadeh: arXiv preprint 2401.07836; contrasts decisive and accumulative AI existential-risk hypotheses.
@misc{kasirzadehTwoTypesAI2025,
  title         = {Two {{Types}} of {{AI Existential Risk}}: {{Decisive}} and {{Accumulative}}},
  shorttitle    = {Two {{Types}} of {{AI Existential Risk}}},
  author        = {Kasirzadeh, Atoosa},
  year          = 2025,
  month         = jan,
  number        = {arXiv:2401.07836},
  eprint        = {2401.07836},
  primaryclass  = {cs},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.2401.07836},
  urldate       = {2026-02-08},
  abstract      = {The conventional discourse on existential risks (x-risks) from AI typically focuses on abrupt, dire events caused by advanced AI systems, particularly those that might achieve or surpass human-level intelligence. These events have severe consequences that either lead to human extinction or irreversibly cripple human civilization to a point beyond recovery. This discourse, however, often neglects the serious possibility of AI x-risks manifesting incrementally through a series of smaller yet interconnected disruptions, gradually crossing critical thresholds over time. This paper contrasts the conventional "decisive AI x-risk hypothesis" with an "accumulative AI x-risk hypothesis." While the former envisions an overt AI takeover pathway, characterized by scenarios like uncontrollable superintelligence, the latter suggests a different causal pathway to existential catastrophes. This involves a gradual accumulation of critical AI-induced threats such as severe vulnerabilities and systemic erosion of economic and political structures. The accumulative hypothesis suggests a boiling frog scenario where incremental AI risks slowly converge, undermining societal resilience until a triggering event results in irreversible collapse. Through systems analysis, this paper examines the distinct assumptions differentiating these two hypotheses. It is then argued that the accumulative view can reconcile seemingly incompatible perspectives on AI risks. The implications of differentiating between these causal pathways -- the decisive and the accumulative -- for the governance of AI as well as long-term AI safety are discussed.},
  archiveprefix = {arXiv},
  keywords      = {Computer Science - Artificial Intelligence,Computer Science - Computers and Society,Computer Science - Machine Learning},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/JPMPZMR8/Kasirzadeh - 2025 - Two Types of AI Existential Risk Decisive and Accumulative.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/8PY3XLBE/2401.html}
}

@misc{kentonScalableOversightWeak2024a,
  title = {On Scalable Oversight with Weak {{LLMs}} Judging Strong {{LLMs}}},
  author = {Kenton, Zachary and Siegel, Noah Y. and Kram{\'a}r, J{\'a}nos and {Brown-Cohen}, Jonah and Albanie, Samuel and Bulian, Jannis and Agarwal, Rishabh and Lindner, David and Tang, Yunhao and Goodman, Noah D. and Shah, Rohin},
  year = 2024,
  month = jul,
  number = {arXiv:2407.04622},
  eprint = {2407.04622},
  primaryclass = {cs.LG},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2407.04622},
  urldate = {2025-12-23},
  abstract = {Scalable oversight protocols aim to enable humans to accurately supervise superhuman AI. In this paper we study debate, where two AI's compete to convince a judge; consultancy, where a single AI tries to convince a judge that asks questions; and compare to a baseline of direct question-answering, where the judge just answers outright without the AI. We use large language models (LLMs) as both AI agents and as stand-ins for human judges, taking the judge models to be weaker than agent models. We benchmark on a diverse range of asymmetries between judges and agents, extending previous work on a single extractive QA task with information asymmetry, to also include mathematics, coding, logic and multimodal reasoning asymmetries. We find that debate outperforms consultancy across all tasks when the consultant is randomly assigned to argue for the correct/incorrect answer. Comparing debate to direct question answering, the results depend on the type of task: in extractive QA tasks with information asymmetry debate outperforms direct question answering, but in other tasks without information asymmetry the results are mixed. Previous work assigned debaters/consultants an answer to argue for. When we allow them to instead choose which answer to argue for, we find judges are less frequently convinced by the wrong answer in debate than in consultancy. Further, we find that stronger debater models increase judge accuracy, though more modestly than in previous studies.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/SDAL54VP/Kenton et al. - 2024 - On scalable oversight with weak LLMs judging strong LLMs.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/S4UXSIFR/2407.html}
}

@inproceedings{kieselIdentifyingHumanValues2022,
  author = {Kiesel, Johannes and Alshomary, Milad and Handke, Nicolas and Cai, Xiaoni and Wachsmuth, Henning and Stein, Benno},
  editor = {Muresan, Smaranda and Nakov, Preslav and Villavicencio, Aline},
  title = {Identifying the {{Human Values}} behind {{Arguments}}},
  booktitle = {Proceedings of the 60th {{Annual Meeting}} of the {{Association}} for {{Computational Linguistics}} ({{Volume}} 1: {{Long Papers}})},
  year = 2022,
  month = may,
  pages = {4459--4471},
  publisher = {Association for Computational Linguistics},
  address = {Dublin, Ireland},
  doi = {10.18653/v1/2022.acl-long.306},
  urldate = {2025-12-05},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/YU492T26/Kiesel et al. - 2022 - Identifying the Human Values behind Arguments.pdf},
  abstract = {This paper studies the (often implicit) human values behind natural language arguments, such as to have freedom of thought or to be broadminded. Values are commonly accepted answers to why some option is desirable in the ethical sense and are thus essential both in real-world argumentation and theoretical argumentation frameworks. However, their large variety has been a major obstacle to modeling them in argument mining. To overcome this obstacle, we contribute an operationalization of human values, namely a multi-level taxonomy with 54 values that is in line with psychological research. Moreover, we provide a dataset of 5270 arguments from four geographical cultures, manually annotated for human values. First experiments with the automatic classification of human values are promising, with F{$_1$}-scores up to 0.81 and 0.25 on average.}
}

@misc{kirkEmptySignifierProblem2023,
  title = {The {{Empty Signifier Problem}}: {{Towards Clearer Paradigms}} for {{Operationalising}} ``{{Alignment}}'' in {{Large Language Models}}},
  shorttitle = {The {{Empty Signifier Problem}}},
  author = {Kirk, Hannah Rose and Vidgen, Bertie and R{\"o}ttger, Paul and Hale, Scott A.},
  year = 2023,
  month = nov,
  number = {arXiv:2310.02457},
  eprint = {2310.02457},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2310.02457},
  urldate = {2026-01-27},
  abstract = {In this paper, we address the concept of "alignment" in large language models (LLMs) through the lens of post-structuralist socio-political theory, specifically examining its parallels to empty signifiers. To establish a shared vocabulary around how abstract concepts of alignment are operationalised in empirical datasets, we propose a framework that demarcates: 1) which dimensions of model behaviour are considered important, then 2) how meanings and definitions are ascribed to these dimensions, and by whom. We situate existing empirical literature and provide guidance on deciding which paradigm to follow. Through this framework, we aim to foster a culture of transparency and critical evaluation, aiding the community in navigating the complexities of aligning LLMs with human populations.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Computation and Language,Computer Science - Computers and Society},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/87ZLEUWF/Kirk et al. - 2023 - The Empty Signifier Problem Towards Clearer Paradigms for Operationalising Alignment in Large Lan.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/IQH6ZT2T/2310.html}
}

@misc{klingefjordWhatAreHuman2024,
  author = {Klingefjord, Oliver and Lowe, Ryan and Edelman, Joe},
  title = {What Are Human Values, and How Do We Align {{AI}} to Them?},
  year = 2024,
  month = apr,
  publisher = {arXiv},
  number = {arXiv:2404.10636},
  eprint = {2404.10636},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  doi = {10.48550/arXiv.2404.10636},
  urldate = {2025-12-16},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Computers and Society,Computer Science - Human-Computer Interaction,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/PSXGYQTV/Klingefjord et al. - 2024 - What are human values, and how do we align AI to them.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/EGFSGCE9/2404.html},
  abstract = {There is an emerging consensus that we need to align AI systems with human values (Gabriel, 2020; Ji et al., 2024), but it remains unclear how to apply this to language models in practice. We split the problem of "aligning to human values" into three parts: first, eliciting values from people; second, reconciling those values into an alignment target for training ML models; and third, actually training the model. In this paper, we focus on the first two parts, and ask the question: what are "good" ways to synthesize diverse human inputs about values into a target for aligning language models? To answer this question, we first define a set of 6 criteria that we believe must be satisfied for an alignment target to shape model behavior in accordance with human values. We then propose a process for eliciting and reconciling values called Moral Graph Elicitation (MGE), which uses a large language model to interview participants about their values in particular contexts; our approach is inspired by the philosophy of values advanced by Taylor (1977), Chang (2004), and others. We trial MGE with a representative sample of 500 Americans, on 3 intentionally divisive prompts (e.g. advice about abortion). Our results demonstrate that MGE is promising for improving model alignment across all 6 criteria. For example, almost all participants (89.1\%) felt well represented by the process, and (89\%) thought the final moral graph was fair, even if their value wasn't voted as the wisest. Our process often results in "expert" values (e.g. values from women who have solicited abortion advice) rising to the top of the moral graph, without defining who is considered an expert in advance.}
}

@article{kollmussMindGapWhy2002,
  author = {Kollmuss, Anja and Agyeman, Julian},
  title = {Mind the {{Gap}}: {{Why}} Do People Act Environmentally and What Are the Barriers to pro-Environmental Behavior?},
  shorttitle = {Mind the {{Gap}}},
  journal = {Environmental Education Research},
  volume = {8},
  number = {3},
  pages = {239--260},
  year = 2002,
  month = aug,
  publisher = {Routledge},
  issn = {1350-4622},
  doi = {10.1080/13504620220145401},
  urldate = {2025-12-14},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/MQ37AHCP/Kollmuss and Agyeman - 2002 - Mind the Gap Why do people act environmentally and what are the barriers to pro-environmental behav.pdf},
  abstract = {Numerous theoretical frameworks have been developed to explain the gap between the possession of environmental knowledge and environmental awareness, and displaying pro-environmental behavior. Although many hundreds of studies have been undertaken, no definitive explanation has yet been found. Our article describes a few of the most influential and commonly used analytical frameworks: early US linear progression models; altruism, empathy and prosocial behavior models; and finally, sociological models. All of the models we discuss (and many of the ones we do not such as economic models, psychological models that look at behavior in general, social marketing models and that have become known as deliberative and inclusionary processes or procedures (DIPS)) have some validity in certain circumstances. This indicates that the question of what shapes pro-environmental behavior is such a complex one that it cannot be visualized through one single framework or diagram. We then analyze the factors that have been found to have some influence, positive or negative, on pro-environmental behavior such as demographic factors, external factors (e.g. institutional, economic, social and cultural) and internal factors (e.g. motivation, pro-environmental knowledge, awareness, values, attitudes, emotion, locus of control, responsibilities and priorities). Although we point out that developing a model that tries to incorporate all factors might neither be feasible nor useful, we feel that it can help illuminate this complex field. Accordingly, we propose our own model based on the work of Fliegenschnee and Schelakovsky (1998) who were influenced by Fietkau and Kessel (1981).}
}

@article{kullbackInformationSufficiency1951,
  title = {On {{Information}} and {{Sufficiency}}},
  author = {Kullback, S. and Leibler, R. A.},
  year = 1951,
  month = mar,
  journal = {The Annals of Mathematical Statistics},
  volume = {22},
  number = {1},
  pages = {79--86},
  publisher = {Institute of Mathematical Statistics},
  issn = {0003-4851},
  doi = {10.1214/aoms/1177729694},
  urldate = {2025-09-16},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/239QREUV/Kullback and Leibler - 1951 - On Information and Sufficiency.pdf}
}

@misc{kulveitGradualDisempowermentSystemic2025,
  title = {Gradual {{Disempowerment}}: {{Systemic Existential Risks}} from {{Incremental AI Development}}},
  shorttitle = {Gradual {{Disempowerment}}},
  author = {Kulveit, Jan and Douglas, Raymond and Ammann, Nora and Turan, Deger and Krueger, David and Duvenaud, David},
  year = 2025,
  month = jan,
  number = {arXiv:2501.16946},
  eprint = {2501.16946},
  primaryclass = {cs.CY},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2501.16946},
  urldate = {2026-02-08},
  abstract = {This paper examines the systemic risks posed by incremental advancements in artificial intelligence, developing the concept of `gradual disempowerment', in contrast to the abrupt takeover scenarios commonly discussed in AI safety. We analyze how even incremental improvements in AI capabilities can undermine human influence over large-scale systems that society depends on, including the economy, culture, and nation-states. As AI increasingly replaces human labor and cognition in these domains, it can weaken both explicit human control mechanisms (like voting and consumer choice) and the implicit alignments with human interests that often arise from societal systems' reliance on human participation to function. Furthermore, to the extent that these systems incentivise outcomes that do not line up with human preferences, AIs may optimize for those outcomes more aggressively. These effects may be mutually reinforcing across different domains: economic power shapes cultural narratives and political decisions, while cultural shifts alter economic and political behavior. We argue that this dynamic could lead to an effectively irreversible loss of human influence over crucial societal systems, precipitating an existential catastrophe through the permanent disempowerment of humanity. This suggests the need for both technical research and governance approaches that specifically address the risk of incremental erosion of human influence across interconnected societal systems.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Computers and Society},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/UQUVYDMS/Kulveit et al. - 2025 - Gradual Disempowerment Systemic Existential Risks from Incremental AI Development.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/S3LBJCEP/2501.html}
}

@misc{kwaMeasuringAIAbility2025,
  title = {Measuring {{AI Ability}} to {{Complete Long Tasks}}},
  author = {Kwa, Thomas and West, Ben and Becker, Joel and Deng, Amy and Garcia, Katharyn and Hasin, Max and Jawhar, Sami and Kinniment, Megan and Rush, Nate and {Von Arx}, Sydney and Bloom, Ryan and Broadley, Thomas and Du, Haoxing and Goodrich, Brian and Jurkovic, Nikola and Miles, Luke Harold and Nix, Seraphina and Lin, Tao and Parikh, Neev and Rein, David and Sato, Lucas Jun Koba and Wijk, Hjalmar and Ziegler, Daniel M. and Barnes, Elizabeth and Chan, Lawrence},
  year = 2025,
  month = mar,
  number = {arXiv:2503.14499},
  eprint = {2503.14499},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2503.14499},
  urldate = {2025-12-24},
  abstract = {Despite rapid progress on AI benchmarks, the real-world meaning of benchmark performance remains unclear. To quantify the capabilities of AI systems in terms of human capabilities, we propose a new metric: 50\%-task-completion time horizon. This is the time humans typically take to complete tasks that AI models can complete with 50\% success rate. We first timed humans with relevant domain expertise on a combination of RE-Bench, HCAST, and 66 novel shorter tasks. On these tasks, current frontier AI models such as Claude 3.7 Sonnet have a 50\% time horizon of around 50 minutes. Furthermore, frontier AI time horizon has been doubling approximately every seven months since 2019, though the trend may have accelerated in 2024. The increase in AI models' time horizons seems to be primarily driven by greater reliability and ability to adapt to mistakes, combined with better logical reasoning and tool use capabilities. We discuss the limitations of our results -- including their degree of external validity -- and the implications of increased autonomy for dangerous capabilities. If these results generalize to real-world software tasks, extrapolation of this trend predicts that within 5 years, AI systems will be capable of automating many software tasks that currently take humans a month.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/37L24T5H/Kwa et al. - 2025 - Measuring AI Ability to Complete Long Tasks.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/K8KEDTY3/2503.html}
}

@misc{leeRLAIFVsRLHF2024,
  author = {Lee, Harrison and Phatale, Samrat and Mansoor, Hassan and Mesnard, Thomas and Ferret, Johan and Lu, Kellie and Bishop, Colton and Hall, Ethan and Carbune, Victor and Rastogi, Abhinav and Prakash, Sushant},
  title = {{{RLAIF}} vs. {{RLHF}}: {{Scaling Reinforcement Learning}} from {{Human Feedback}} with {{AI Feedback}}},
  shorttitle = {{{RLAIF}} vs. {{RLHF}}},
  year = 2024,
  month = sep,
  publisher = {arXiv},
  number = {arXiv:2309.00267},
  eprint = {2309.00267},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  doi = {10.48550/arXiv.2309.00267},
  urldate = {2025-11-26},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/ICW4QC3Z/Lee et al. - 2024 - RLAIF vs. RLHF Scaling Reinforcement Learning from Human Feedback with AI Feedback.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/RX4Z8SIV/2309.html},
  abstract = {Reinforcement learning from human feedback (RLHF) has proven effective in aligning large language models (LLMs) with human preferences, but gathering high-quality preference labels is expensive. RL from AI Feedback (RLAIF), introduced in Bai et al., offers a promising alternative that trains the reward model (RM) on preferences generated by an off-the-shelf LLM. Across the tasks of summarization, helpful dialogue generation, and harmless dialogue generation, we show that RLAIF achieves comparable performance to RLHF. Furthermore, we take a step towards "self-improvement" by demonstrating that RLAIF can outperform a supervised fine-tuned baseline even when the AI labeler is the same size as the policy, or even the exact same checkpoint as the initial policy. Finally, we introduce direct-RLAIF (d-RLAIF) - a technique that circumvents RM training by obtaining rewards directly from an off-the-shelf LLM during RL, which achieves superior performance to canonical RLAIF. Our results suggest that RLAIF can achieve performance on-par with using human feedback, offering a potential solution to the scalability limitations of RLHF.}
}

@misc{lehmannManifestoDataCollection2025,
  title = {The {{Manifesto Data Collection}}. {{Manifesto Project}} ({{MRG}}/{{CMP}}/{{MARPOR}}). {{Version}} 2025a},
  author = {Lehmann, Pola and Franzmann, Simon and {Al-Gaddooa}, Denise and Burst, Tobias and Ivanusch, Christoph and Regel, Sven and Riethm{\"u}ller, Felicia and Volkens, Andrea and We{\ss}els, Bernhard and Zehnter, Lisa},
  year = 2025,
  publisher = {Wissenschaftszentrum Berlin f{\"u}r Sozialforschung (WZB) / G{\"o}ttinger Institut f{\"u}r Demokratieforschung (IfDem)},
  doi = {10.25522/manifesto.mpds.2025a}
}

@misc{leikeScalableAgentAlignment2018,
  author = {Leike, Jan and Krueger, David and Everitt, Tom and Martic, Miljan and Maini, Vishal and Legg, Shane},
  title = {Scalable Agent Alignment via Reward Modeling: A Research Direction},
  shorttitle = {Scalable Agent Alignment via Reward Modeling},
  year = 2018,
  month = nov,
  publisher = {arXiv},
  number = {arXiv:1811.07871},
  eprint = {1811.07871},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  doi = {10.48550/arXiv.1811.07871},
  urldate = {2025-12-23},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing,Statistics - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/M26G8IRL/Leike et al. - 2018 - Scalable agent alignment via reward modeling a research direction.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/J99K6U8Z/1811.html},
  abstract = {One obstacle to applying reinforcement learning algorithms to real-world problems is the lack of suitable reward functions. Designing such reward functions is difficult in part because the user only has an implicit understanding of the task objective. This gives rise to the agent alignment problem: how do we create agents that behave in accordance with the user's intentions? We outline a high-level research direction to solve the agent alignment problem centered around reward modeling: learning a reward function from interaction with the user and optimizing the learned reward function with reinforcement learning. We discuss the key challenges we expect to face when scaling reward modeling to complex and general domains, concrete approaches to mitigate these challenges, and ways to establish trust in the resulting agents.}
}

% NOTE(review): this book entry has no publisher field, which the book type requires;
% confirm the edition actually consulted before adding one.
@book{leonfestingerTheoryCognitiveDissonance1957,
  title = {A {{Theory}} of {{Cognitive Dissonance}}},
  author = {Festinger, Leon},
  year = 1957,
  urldate = {2025-12-15},
  collaborator = {{Internet Archive}},
  langid = {english}
}

@misc{liChatDoctorMedicalChat2023,
  title = {{{ChatDoctor}}: {{A Medical Chat Model Fine-Tuned}} on a {{Large Language Model Meta-AI}} ({{LLaMA}}) {{Using Medical Domain Knowledge}}},
  shorttitle = {{{ChatDoctor}}},
  author = {Li, Yunxiang and Li, Zihan and Zhang, Kai and Dan, Ruilong and Jiang, Steve and Zhang, You},
  year = 2023,
  month = jun,
  number = {arXiv:2303.14070},
  eprint = {2303.14070},
  primaryclass = {cs.CL},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2303.14070},
  urldate = {2025-12-06},
  abstract = {The primary aim of this research was to address the limitations observed in the medical knowledge of prevalent large language models (LLMs) such as ChatGPT, by creating a specialized language model with enhanced accuracy in medical advice. We achieved this by adapting and refining the large language model meta-AI (LLaMA) using a large dataset of 100,000 patient-doctor dialogues sourced from a widely used online medical consultation platform. These conversations were cleaned and anonymized to respect privacy concerns. In addition to the model refinement, we incorporated a self-directed information retrieval mechanism, allowing the model to access and utilize real-time information from online sources like Wikipedia and data from curated offline medical databases. The fine-tuning of the model with real-world patient-doctor interactions significantly improved the model's ability to understand patient needs and provide informed advice. By equipping the model with self-directed information retrieval from reliable online and offline sources, we observed substantial improvements in the accuracy of its responses. Our proposed ChatDoctor, represents a significant advancement in medical LLMs, demonstrating a significant improvement in understanding patient inquiries and providing accurate advice. Given the high stakes and low error tolerance in the medical field, such enhancements in providing accurate and reliable information are not only beneficial but essential.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Computation and Language},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/SJK7K9LQ/Li et al. - 2023 - ChatDoctor A Medical Chat Model Fine-Tuned on a Large Language Model Meta-AI (LLaMA) Using Medical.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/CPN8LPIQ/2303.html}
}

@misc{liCultureLLMIncorporatingCultural2024,
  author = {Li, Cheng and Chen, Mengzhou and Wang, Jindong and Sitaram, Sunayana and Xie, Xing},
  title = {{{CultureLLM}}: {{Incorporating Cultural Differences}} into {{Large Language Models}}},
  shorttitle = {{{CultureLLM}}},
  year = 2024,
  month = dec,
  publisher = {arXiv},
  number = {arXiv:2402.10946},
  eprint = {2402.10946},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  doi = {10.48550/arXiv.2402.10946},
  urldate = {2025-12-07},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/U66FXPIG/Li et al. - 2024 - CultureLLM Incorporating Cultural Differences into Large Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/VPXS69XT/2402.html},
  abstract = {Large language models (LLMs) are reported to be partial to certain cultures owing to the training data dominance from the English corpora. Since multilingual cultural data are often expensive to collect, existing efforts handle this by prompt engineering or culture-specific pre-training. However, they might overlook the knowledge deficiency of low-resource culture and require extensive computing resources. In this paper, we propose CultureLLM, a cost-effective solution to incorporate cultural differences into LLMs. CultureLLM adopts World Value Survey (WVS) as seed data and generates semantically equivalent training data via the proposed semantic data augmentation. Using only 50 seed samples from WVS with augmented data, we fine-tune culture-specific LLMs and one unified model (CultureLLM-One) for 9 cultures covering rich and low-resource languages. Extensive experiments on 60 culture-related datasets demonstrate that CultureLLM significantly outperforms various counterparts such as GPT-3.5 (by 8.1\%) and Gemini Pro (by 9.5\%) with comparable performance to GPT-4 or even better. Our human study shows that the generated samples are semantically equivalent to the original samples, providing an effective solution for LLMs augmentation. Code is released at https://github.com/Scarelette/CultureLLM.}
}

@misc{linAWQActivationawareWeight2024,
  title = {{{AWQ}}: {{Activation-aware Weight Quantization}} for {{LLM Compression}} and {{Acceleration}}},
  shorttitle = {{{AWQ}}},
  author = {Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Chen, Wei-Ming and Wang, Wei-Chen and Xiao, Guangxuan and Dang, Xingyu and Gan, Chuang and Han, Song},
  year = 2024,
  month = jul,
  number = {arXiv:2306.00978},
  eprint = {2306.00978},
  primaryclass = {cs.CL},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2306.00978},
  urldate = {2026-01-25},
  abstract = {Large language models (LLMs) have transformed numerous AI applications. On-device LLM is becoming increasingly important: running LLMs locally on edge devices can reduce the cloud computing cost and protect users' privacy. However, the astronomical model size and the limited hardware resource pose significant deployment challenges. We propose Activation-aware Weight Quantization (AWQ), a hardware-friendly approach for LLM low-bit weight-only quantization. AWQ finds that not all weights in an LLM are equally important. Protecting only 1\% salient weights can greatly reduce quantization error. To identify salient weight channels, we should refer to the activation distribution, not weights. To avoid the hardware-inefficient mix-precision quantization, we mathematically derive that scaling up the salient channels can reduce the quantization error. AWQ employs an equivalent transformation to scale the salient weight channels to protect them. The scale is determined by collecting the activation statistics offline. AWQ does not rely on any backpropagation or reconstruction, so it generalizes to different domains and modalities without overfitting the calibration set. AWQ outperforms existing work on various language modeling and domain-specific benchmarks (coding and math). Thanks to better generalization, it achieves excellent quantization performance for instruction-tuned LMs and, for the first time, multi-modal LMs. Alongside AWQ, we implement TinyChat, an efficient and flexible inference framework tailored for 4-bit on-device LLM/VLMs. With kernel fusion and platform-aware weight packing, TinyChat offers more than 3x speedup over the Huggingface FP16 implementation on both desktop and mobile GPUs. It also democratizes the deployment of the 70B Llama-2 model on mobile GPUs.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Computation and Language},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/CDVH3LNQ/Lin et al. - 2024 - AWQ Activation-aware Weight Quantization for LLM Compression and Acceleration.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/DE5D9VRY/2306.html}
}

% NOTE(review): the abstract below still carries an unexpanded "\num" placeholder
% (stored as \textbackslash num); the intended model count cannot be recovered from
% this entry -- confirm against the paper before replacing it.
@misc{liuAgentBenchEvaluatingLLMs2025,
  title = {{{AgentBench}}: {{Evaluating LLMs}} as {{Agents}}},
  shorttitle = {{{AgentBench}}},
  author = {Liu, Xiao and Yu, Hao and Zhang, Hanchen and Xu, Yifan and Lei, Xuanyu and Lai, Hanyu and Gu, Yu and Ding, Hangliang and Men, Kaiwen and Yang, Kejuan and Zhang, Shudan and Deng, Xiang and Zeng, Aohan and Du, Zhengxiao and Zhang, Chenhui and Shen, Sheng and Zhang, Tianjun and Su, Yu and Sun, Huan and Huang, Minlie and Dong, Yuxiao and Tang, Jie},
  year = 2025,
  month = oct,
  number = {arXiv:2308.03688},
  eprint = {2308.03688},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2308.03688},
  urldate = {2026-01-26},
  abstract = {The potential of Large Language Model (LLM) as agents has been widely acknowledged recently. Thus, there is an urgent need to quantitatively evaluate LLMs as agents on challenging tasks in interactive environments. We present AgentBench, a multi-dimensional benchmark that consists of 8 distinct environments to assess LLM-as-Agent's reasoning and decision-making abilities. Our extensive test over \textbackslash num API-based and open-sourced (OSS) LLMs shows that, while top commercial LLMs present a strong ability of acting as agents in complex environments, there is a significant disparity in performance between them and many OSS competitors that are no larger than 70B. We identify the typical reasons of failures in environments and LLMs, showing that poor long-term reasoning, decision-making, and instruction following abilities are the main obstacles for developing usable LLM agents. Improving instruction following and training on high quality multi-round alignment data could improve agent performance. And different from existing assumptions, training on code present ambivalent impacts on different agent tasks. Datasets, environments, and an integrated evaluation package for AgentBench are released at https://github.com/THUDM/AgentBench.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/4CTVJM7B/Liu et al. - 2025 - AgentBench Evaluating LLMs as Agents.pdf}
}

@misc{luViLBERTPretrainingTaskAgnostic2019,
  title = {{{ViLBERT}}: {{Pretraining Task-Agnostic Visiolinguistic Representations}} for {{Vision-and-Language Tasks}}},
  shorttitle = {{{ViLBERT}}},
  author = {Lu, Jiasen and Batra, Dhruv and Parikh, Devi and Lee, Stefan},
  year = 2019,
  month = aug,
  number = {arXiv:1908.02265},
  eprint = {1908.02265},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.1908.02265},
  urldate = {2025-12-07},
  abstract = {We present ViLBERT (short for Vision-and-Language BERT), a model for learning task-agnostic joint representations of image content and natural language. We extend the popular BERT architecture to a multi-modal two-stream model, processing both visual and textual inputs in separate streams that interact through co-attentional transformer layers. We pretrain our model through two proxy tasks on the large, automatically collected Conceptual Captions dataset and then transfer it to multiple established vision-and-language tasks -- visual question answering, visual commonsense reasoning, referring expressions, and caption-based image retrieval -- by making only minor additions to the base architecture. We observe significant improvements across tasks compared to existing task-specific models -- achieving state-of-the-art on all four tasks. Our work represents a shift away from learning groundings between vision and language only as part of task training and towards treating visual grounding as a pretrainable and transferable capability.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Computation and Language,Computer Science - Computer Vision and Pattern Recognition},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/LUMWEHZG/Lu et al. - 2019 - ViLBERT Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/NC3WV7T4/1908.html}
}

@misc{malteEvolutionTransferLearning2019,
  author = {Malte, Aditya and Ratadiya, Pratik},
  title = {Evolution of Transfer Learning in Natural Language Processing},
  year = {2019},
  month = oct,
  number = {arXiv:1910.07370},
  eprint = {1910.07370},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.1910.07370},
  urldate = {2026-01-05},
  abstract = {In this paper, we present a study of the recent advancements which have helped bring Transfer Learning to NLP through the use of semi-supervised training. We discuss cutting-edge methods and architectures such as BERT, GPT, ELMo, ULMFit among others. Classically, tasks in natural language processing have been performed through rule-based and statistical methodologies. However, owing to the vast nature of natural languages these methods do not generalise well and failed to learn the nuances of language. Thus machine learning algorithms such as Naive Bayes and decision trees coupled with traditional models such as Bag-of-Words and N-grams were used to usurp this problem. Eventually, with the advent of advanced recurrent neural network architectures such as the LSTM, we were able to achieve state-of-the-art performance in several natural language processing tasks such as text classification and machine translation. We talk about how Transfer Learning has brought about the well-known ImageNet moment for NLP. Several advanced architectures such as the Transformer and its variants have allowed practitioners to leverage knowledge gained from unrelated task to drastically fasten convergence and provide better performance on the target task. This survey represents an effort at providing a succinct yet complete understanding of the recent advances in natural language processing using deep learning in with a special focus on detailing transfer learning and its potential advantages.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Computation and Language},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/5EH6S7HQ/Malte and Ratadiya - 2019 - Evolution of transfer learning in natural language processing.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/NREJB3XI/1910.html}
}

@article{manheimFragileWorldHypothesis2020,
  author = {Manheim, David},
  title = {The {{Fragile World Hypothesis}}: {{Complexity}}, {{Fragility}}, and {{Systemic Existential Risk}}},
  shorttitle = {The {{Fragile World Hypothesis}}},
  year = {2020},
  month = sep,
  journal = {Futures},
  volume = {122},
  pages = {102570},
  issn = {0016-3287},
  doi = {10.1016/j.futures.2020.102570},
  urldate = {2025-12-24},
  abstract = {The possibility of social and technological collapse has been the focus of science fiction tropes for decades, but more recent focus has been on specific sources of existential and global catastrophic risk. Because these scenarios are simple to understand and envision, they receive more attention than risks due to complex interplay of failures, or risks that cannot be clearly specified. In this paper, we discuss the possibility that complexity of a certain type leads to fragility which can function as a source of catastrophic or even existential risk. The paper first reviews a hypothesis by Bostrom about inevitable technological risks, named the vulnerable world hypothesis. This paper next hypothesizes that fragility may not only be a possible risk, but could be inevitable, and would therefore be a subclass or example of Bostrom's vulnerable worlds. After introducing the titular fragile world hypothesis, the paper details the conditions under which it would be correct, and presents arguments for why the conditions may in fact may apply. Finally, the assumptions and potential mitigations of the new hypothesis are contrasted with those Bostrom suggests.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/PII3H9XE/Manheim - 2020 - The Fragile World Hypothesis Complexity, Fragility, and Systemic Existential Risk.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/RW4GCH2D/S0016328720300604.html}
}

@misc{measuring-ai-ability-to-complete-long-tasks,
  title = {Measuring {{AI}} Ability to Complete Long Tasks},
  author = {{METR}},
  year = 2025,
  month = mar,
  url = {https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/}
}

@misc{meinkeFrontierModelsAre2025c,
  author = {Meinke, Alexander and Schoen, Bronson and Scheurer, J{\'e}r{\'e}my and Balesni, Mikita and Shah, Rusheb and Hobbhahn, Marius},
  title = {Frontier {{Models}} Are {{Capable}} of {{In-context Scheming}}},
  year = {2025},
  month = jan,
  number = {arXiv:2412.04984},
  eprint = {2412.04984},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2412.04984},
  urldate = {2025-11-16},
  abstract = {Frontier models are increasingly trained and deployed as autonomous agent. One safety concern is that AI agents might covertly pursue misaligned goals, hiding their true capabilities and objectives - also known as scheming. We study whether models have the capability to scheme in pursuit of a goal that we provide in-context and instruct the model to strongly follow. We evaluate frontier models on a suite of six agentic evaluations where models are instructed to pursue goals and are placed in environments that incentivize scheming. Our results show that o1, Claude 3.5 Sonnet, Claude 3 Opus, Gemini 1.5 Pro, and Llama 3.1 405B all demonstrate in-context scheming capabilities. They recognize scheming as a viable strategy and readily engage in such behavior. For example, models strategically introduce subtle mistakes into their responses, attempt to disable their oversight mechanisms, and even exfiltrate what they believe to be their model weights to external servers. Additionally, this deceptive behavior proves persistent. When o1 has engaged in scheming, it maintains its deception in over 85\% of follow-up questions and often remains deceptive in multi-turn interrogations. Analysis of the models' chains-of-thought reveals that models explicitly reason about these deceptive strategies, providing evidence that the scheming behavior is not accidental. Surprisingly, we also find rare instances where models engage in scheming when only given a goal, without being strongly nudged to pursue it. We observe cases where Claude 3.5 Sonnet strategically underperforms in evaluations in pursuit of being helpful, a goal that was acquired during training rather than in-context. Our findings demonstrate that frontier models now possess capabilities for basic in-context scheming, making the potential of AI agents to engage in scheming behavior a concrete rather than theoretical concern.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/4HWXK46Z/Meinke et al. - 2025 - Frontier Models are Capable of In-context Scheming.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/E8PCV7Y2/2412.html}
}

@inproceedings{mengSimPOSimplePreference2024,
  title = {{{SimPO}}: {{Simple Preference Optimization}} with a {{Reference-Free Reward}}},
  shorttitle = {{{SimPO}}},
  booktitle = {Advances in Neural Information Processing Systems},
  author = {Meng, Yu and Xia, Mengzhou and Chen, Danqi},
  year = 2024,
  month = dec,
  volume = {37},
  pages = {124198--124235},
  doi = {10.52202/079017-3946},
  urldate = {2025-12-01},
  langid = {english},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/BFJZ5857/Meng et al. - 2024 - SimPO Simple Preference Optimization with a Reference-Free Reward.pdf}
}

@misc{merityRegularizingOptimizingLSTM2017,
  author = {Merity, Stephen and Keskar, Nitish Shirish and Socher, Richard},
  title = {Regularizing and {{Optimizing LSTM Language Models}}},
  year = {2017},
  month = aug,
  number = {arXiv:1708.02182},
  eprint = {1708.02182},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.1708.02182},
  urldate = {2025-12-06},
  abstract = {Recurrent neural networks (RNNs), such as long short-term memory networks (LSTMs), serve as a fundamental building block for many sequence learning tasks, including machine translation, language modeling, and question answering. In this paper, we consider the specific problem of word-level language modeling and investigate strategies for regularizing and optimizing LSTM-based models. We propose the weight-dropped LSTM which uses DropConnect on hidden-to-hidden weights as a form of recurrent regularization. Further, we introduce NT-ASGD, a variant of the averaged stochastic gradient method, wherein the averaging trigger is determined using a non-monotonic condition as opposed to being tuned by the user. Using these and other regularization strategies, we achieve state-of-the-art word level perplexities on two data sets: 57.3 on Penn Treebank and 65.8 on WikiText-2. In exploring the effectiveness of a neural cache in conjunction with our proposed model, we achieve an even lower state-of-the-art perplexity of 52.8 on Penn Treebank and 52.0 on WikiText-2.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/QF5ZM57V/Merity et al. - 2017 - Regularizing and Optimizing LSTM Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/97MARU32/1708.html}
}

@article{milanoRecommenderSystemsTheir2020,
  author = {Milano, Silvia and Taddeo, Mariarosaria and Floridi, Luciano},
  title = {Recommender Systems and Their Ethical Challenges},
  year = {2020},
  month = dec,
  journal = {AI \& SOCIETY},
  volume = {35},
  number = {4},
  pages = {957--967},
  issn = {1435-5655},
  doi = {10.1007/s00146-020-00950-y},
  urldate = {2025-12-22},
  abstract = {This article presents the first, systematic analysis of the ethical challenges posed by recommender systems through a literature review. The article identifies six areas of concern, and maps them onto a proposed taxonomy of different kinds of ethical impact. The analysis uncovers a gap in the literature: currently user-centred approaches do not consider the interests of a variety of other stakeholders---as opposed to just the receivers of a recommendation---in assessing the ethical impacts of a recommender system.},
  langid = {english},
  keywords = {Algorithms,Artificial intelligence,Digital ethics,Ethical trade-offs,Ethics of recommendation,Machine learning,Recommender systems},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/DANXVB4E/Milano et al. - 2020 - Recommender systems and their ethical challenges.pdf}
}

@book{millLiberty2011,
  title = {On {{Liberty}}},
  author = {Mill, John Stuart},
  year = 2011,
  month = jan,
  publisher = {Project Gutenberg},
  url = {https://www.gutenberg.org/ebooks/34901},
  urldate = {2025-12-22},
  copyright = {Public domain in the USA.},
  langid = {english},
  keywords = {Liberty}
}

@book{millUtilitarianism1879,
  title = {Utilitarianism},
  author = {Mill, John Stuart},
  year = 1879,
  publisher = {Longmans, Green, and Co.},
  address = {London},
  url = {https://www.gutenberg.org/ebooks/11224},
  urldate = {2025-11-18},
  copyright = {Public domain in the USA.},
  langid = {english},
  keywords = {Utilitarianism}
}

@misc{minaeeLargeLanguageModels2025,
  title = {Large {{Language Models}}: {{A Survey}}},
  shorttitle = {Large {{Language Models}}},
  author = {Minaee, Shervin and Mikolov, Tomas and Nikzad, Narjes and Chenaghlu, Meysam and Socher, Richard and Amatriain, Xavier and Gao, Jianfeng},
  year = 2025,
  month = mar,
  number = {arXiv:2402.06196},
  eprint = {2402.06196},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2402.06196},
  urldate = {2026-02-08},
  abstract = {Large Language Models (LLMs) have drawn a lot of attention due to their strong performance on a wide range of natural language tasks, since the release of ChatGPT in November 2022. LLMs' ability of general-purpose language understanding and generation is acquired by training billions of model's parameters on massive amounts of text data, as predicted by scaling laws (Kaplan et al., 2020; Hoffmann et al., 2022). The research area of LLMs, while very recent, is evolving rapidly in many different ways. In this paper, we review some of the most prominent LLMs, including three popular LLM families (GPT, LLaMA, PaLM), and discuss their characteristics, contributions and limitations. We also give an overview of techniques developed to build, and augment LLMs. We then survey popular datasets prepared for LLM training, fine-tuning, and evaluation, review widely used LLM evaluation metrics, and compare the performance of several popular LLMs on a set of representative benchmarks. Finally, we conclude the paper by discussing open challenges and future research directions.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/JU8GNITA/Minaee et al. - 2025 - Large Language Models A Survey.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/JAAABZXJ/2402.html}
}

@book{moore1903principia,
  author = {Moore, George Edward},
  title = {Principia Ethica},
  year = {1903},
  publisher = {Cambridge University Press},
  address = {Cambridge}
}

@book{mullerLawsManu1886,
  title = {The {{Laws}} of {{Manu}}},
  author = {M{\"u}ller, F. Max and B{\"u}hler, Georg},
  year = 1886,
  publisher = {Clarendon Press},
  address = {Oxford},
  abstract = {This is a subset of F. Max M{\"u}ller's great collection The Sacred Books of the East which includes translations of all the most important works of the seven non-Christian religions which have exercised a profound influence on the civilizations of the continent of Asia. The works have been translated by leading authorities in their field.},
  googlebooks = {CcVwEAAAQBAJ},
  isbn = {978-1-136-86414-8},
  langid = {english},
  keywords = {Social Science / Ethnic Studies / General,Social Science / Regional Studies}
}

@incollection{NADKARNI201685,
  title = {Core Technologies: {{Machine}} Learning and Natural Language Processing},
  booktitle = {Clinical Research Computing},
  author = {Nadkarni, Prakash},
  editor = {Nadkarni, Prakash},
  year = 2016,
  chapter = {4},
  pages = {85--114},
  publisher = {Academic Press},
  doi = {10.1016/B978-0-12-803130-8.00004-X},
  abstract = {This chapter focuses on the topic of machine learning and introduces the basics of natural language processing, which increasingly uses machine-learning techniques. The link between traditional statistics and machine learning is discussed, and a vocabulary of fundamental machine-learning concepts is introduced. I differentiate between supervised and unsupervised learning methods, and introduce fundamental techniques that are widely used, I introduce Bayes' theorem, which forms the basis of several methods. I discuss methods that apply to sequential data. I discuss the basics of natural language processing (NLP), and consider how it has influenced information retrieval: the challenges in scaling NLP up are also discussed. Some of the problems are intrinsic to the goals of NLP, which tries to do much more with text than information-retrieval methods.},
  isbn = {978-0-12-803130-8},
  keywords = {artificial neural networks,conditional random fields,hidden Markov models,linear regression,logistic regression,Markov chain,N-grams,Naive Bayes,natural language processing,support vector machines}
}

@book{neumannTheoryGamesEconomic1944,
  title = {Theory of {{Games}} and {{Economic Behavior}}},
  author = {von Neumann, John and Morgenstern, Oskar},
  year = 1944,
  publisher = {Princeton University Press},
  address = {Princeton, NJ},
  urldate = {2026-01-26},
  abstract = {Book Source: Digital Library of India Item 2015.215284 dc.contributor.author: Neumann,john Von. dc.date.accessioned: 2015-07-09T19:09:41Z dc.date.available: 2015-07-09T19:09:41Z dc.date.digitalpublicationdate: 2005-06-25 dc.date.citation: 1944 dc.identifier.barcode: 2990140056528 dc.identifier.origpath: /data\_copy/upload/0056/533 dc.identifier.copyno: 1 dc.identifier.uri: http://www.new.dli.ernet.in/handle/2015/215284 dc.description.scannerno: 8 dc.description.scanningcentre: Osmania University dc.description.main: 1 dc.description.tagged: 0 dc.description.totalpages: 674 dc.format.mimetype: application/pdf dc.language.iso: English dc.publisher.digitalrepublisher: Digital Library Of India dc.publisher: Princeton University Press. dc.rights: Copyright Permitted dc.source.library: Osmania University dc.subject.classification: The Arts dc.title: Theory Of Games And Economic Behavior},
  langid = {english},
  keywords = {Osmania}
}

@misc{nfeltSovereignRemediesAI2025,
  title = {Sovereign Remedies: {{Between AI}} Autonomy and Control},
  shorttitle = {Sovereign Remedies},
  author = {{Atlantic Council}},
  year = 2025,
  month = apr,
  journal = {Atlantic Council},
  urldate = {2026-01-06},
  abstract = {Sovereign AI as a phenomenon is going to gain momentum, as national governments find ``wholesale'' AI offerings unsuited to their needs.},
  langid = {american},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/X5DQZAX2/sovereign-remedies-between-ai-autonomy-and-control.html}
}

@inproceedings{ngAlgorithmsInverseReinforcement2000,
  author = {Ng, Andrew Y. and Russell, Stuart J.},
  title = {Algorithms for {{Inverse Reinforcement Learning}}},
  booktitle = {Proceedings of the {{Seventeenth International Conference}} on {{Machine Learning}}},
  year = {2000},
  month = jun,
  series = {{{ICML}} '00},
  pages = {663--670},
  publisher = {Morgan Kaufmann Publishers Inc.},
  address = {San Francisco, CA, USA},
  urldate = {2026-01-26},
  isbn = {978-1-55860-707-1},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/Q7HFNYXP/icml00-irl.pdf}
}

@misc{ngoAlignmentProblemDeep2025,
  author = {Ngo, Richard and Chan, Lawrence and Mindermann, S{\"o}ren},
  title = {The {{Alignment Problem}} from a {{Deep Learning Perspective}}},
  year = {2025},
  month = may,
  number = {arXiv:2209.00626},
  eprint = {2209.00626},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2209.00626},
  urldate = {2025-12-24},
  abstract = {In coming years or decades, artificial general intelligence (AGI) may surpass human capabilities across many critical domains. We argue that, without substantial effort to prevent it, AGIs could learn to pursue goals that are in conflict (i.e. misaligned) with human interests. If trained like today's most capable models, AGIs could learn to act deceptively to receive higher reward, learn misaligned internally-represented goals which generalize beyond their fine-tuning distributions, and pursue those goals using power-seeking strategies. We review emerging evidence for these properties. In this revised paper, we include more direct empirical evidence published as of early 2025. AGIs with these properties would be difficult to align and may appear aligned even when they are not. Finally, we briefly outline how the deployment of misaligned AGIs might irreversibly undermine human control over the world, and we review research directions aimed at preventing this outcome.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning}
}

@misc{nieSurveytoBehaviorDownstreamAlignment2025,
  author = {Nie, Shangrui and Mai, Florian and Kacz{\'e}r, David and Welch, Charles and Zhao, Zhixue and Flek, Lucie},
  title = {Survey-to-{{Behavior}}: {{Downstream Alignment}} of {{Human Values}} in {{LLMs}} via {{Survey Questions}}},
  shorttitle = {Survey-to-{{Behavior}}},
  year = {2025},
  month = aug,
  number = {arXiv:2508.11414},
  eprint = {2508.11414},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2508.11414},
  urldate = {2025-12-02},
  abstract = {Large language models implicitly encode preferences over human values, yet steering them often requires large training data. In this work, we investigate a simple approach: Can we reliably modify a model's value system in downstream behavior by training it to answer value survey questions accordingly? We first construct value profiles of several open-source LLMs by asking them to rate a series of value-related descriptions spanning 20 distinct human values, which we use as a baseline for subsequent experiments. We then investigate whether the value system of a model can be governed by fine-tuning on the value surveys. We evaluate the effect of finetuning on the model's behavior in two ways; first, we assess how answers change on in-domain, held-out survey questions. Second, we evaluate whether the model's behavior changes in out-of-domain settings (situational scenarios). To this end, we construct a contextualized moral judgment dataset based on Reddit posts and evaluate changes in the model's behavior in text-based adventure games. We demonstrate that our simple approach can not only change the model's answers to in-domain survey questions, but also produces substantial shifts (value alignment) in implicit downstream task behavior.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Computation and Language},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/VNLMTDC9/Nie et al. - 2025 - Survey-to-Behavior Downstream Alignment of Human Values in LLMs via Survey Questions.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/ISRNJDRJ/2508.html}
}

@article{nisbettTellingMoreWe1977,
  title = {Telling {{More Than We Can Know}}: {{Verbal Reports}} on {{Mental Processes}}},
  shorttitle = {Telling {{More Than We Can Know}}},
  author = {Nisbett, Richard E. and Wilson, Timothy D.},
  year = 1977,
  journal = {Psychological Review},
  volume = {84},
  number = {3},
  pages = {231--259},
  doi = {10.1037/0033-295x.84.3.231},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/WD4JCE45/Nisbett and Wilson - 1977 - Telling More Than We Can Know Verbal Reports on Mental Processes.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/H7RBB5JL/NISTMT.html}
}

@misc{NZAVS2025,
  title = {{{New Zealand Attitudes and Values Study}} ({{NZAVS}}), 2009--2025},
  author = {{University of Auckland}},
  year = 2025,
  publisher = {University of Auckland}
}

@article{obermeyerDissectingRacialBias2019,
  title = {Dissecting Racial Bias in an Algorithm Used to Manage the Health of Populations},
  author = {Obermeyer, Ziad and Powers, Brian and Vogeli, Christine and Mullainathan, Sendhil},
  year = 2019,
  month = oct,
  journal = {Science},
  volume = {366},
  number = {6464},
  pages = {447--453},
  issn = {1095-9203},
  doi = {10.1126/science.aax2342},
  abstract = {Health systems rely on commercial prediction algorithms to identify and help patients with complex health needs. We show that a widely used algorithm, typical of this industry-wide approach and affecting millions of patients, exhibits significant racial bias: At a given risk score, Black patients are considerably sicker than White patients, as evidenced by signs of uncontrolled illnesses. Remedying this disparity would increase the percentage of Black patients receiving additional help from 17.7 to 46.5\%. The bias arises because the algorithm predicts health care costs rather than illness, but unequal access to care means that we spend less money caring for Black patients than for White patients. Thus, despite health care cost appearing to be an effective proxy for health by some measures of predictive accuracy, large racial biases arise. We suggest that the choice of convenient, seemingly effective proxies for ground truth can be an important source of algorithmic bias in many contexts.},
  langid = {english},
  pmid = {31649194},
  keywords = {Algorithms,Bias,Black or African American,Chronic Disease,Health Care Costs,Health Status Disparities,Humans,Medical Records,Racism,Risk Assessment,United States,White People},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/I3SIWT4Q/Obermeyer et al. - 2019 - Dissecting racial bias in an algorithm used to manage the health of populations.pdf}
}

@article{oheigeartaighExtinctionHumanSpecies2025,
  title = {Extinction of the Human Species: {{What}} Could Cause It and How Likely Is It to Occur?},
  author = {{\'O}h{\'E}igeartaigh, Sean},
  year = 2025,
  month = mar,
  journal = {Cambridge Prisms: Extinction},
  volume = {3},
  pages = {e4},
  publisher = {Cambridge University Press},
  doi = {10.1017/ext.2025.4},
  abstract = {The possibility of human extinction has received growing academic attention over the last several decades. Research has analysed possible pathways to human extinction, as well as ethical considerations relating to human survival. Potential causes of human extinction can be loosely grouped into exogenous threats such as an asteroid impact and anthropogenic threats such as war or a catastrophic physics accident. In all cases, an outcome as extreme as human extinction would require events or developments that either have been of very low probability historically or are entirely unprecedented. This introduces deep uncertainty and methodological challenges to the study of the topic. This review provides an overview of potential human extinction causes considered plausible in the current academic literature, experts' judgements of likelihood where available and a synthesis of ethical and social debates relating to the study of human extinction.},
  keywords = {anthropocene,anthropogenic impacts,climate change,existential risk,societal collapse}
}

@misc{olmo2025olmo3,
  title = {Olmo 3},
  author = {{Team Olmo} and Ettinger, Allyson and Bertsch, Amanda and Kuehl, Bailey and Graham, David and Heineman, David and Groeneveld, Dirk and Brahman, Faeze and Timbers, Finbarr and Ivison, Hamish and Morrison, Jacob and Poznanski, Jake and Lo, Kyle and Soldaini, Luca and Jordan, Matt and Chen, Mayee and Noukhovitch, Michael and Lambert, Nathan and Walsh, Pete and Dasigi, Pradeep and Berry, Robert and Malik, Saumya and Shah, Saurabh and Geng, Scott and Arora, Shane and Gupta, Shashank and Anderson, Taira and Xiao, Teng and Murray, Tyler and Romero, Tyler and Graf, Victoria and Asai, Akari and Bhagia, Akshita and Wettig, Alexander and Liu, Alisa and Rangapur, Aman and Anastasiades, Chloe and Huang, Costa and Schwenk, Dustin and Trivedi, Harsh and Magnusson, Ian and Lochner, Jaron and Liu, Jiacheng and Miranda, Lester James V. and Sap, Maarten and Morgan, Malia and Schmitz, Michael and Guerquin, Michal and Wilson, Michael and Huff, Regan and Le Bras, Ronan and Xin, Rui and Shao, Rulin and Skjonsberg, Sam and Shen, Shannon Zejiang and Li, Shuyue Stella and Wilde, Tucker and Pyatkin, Valentina and Merrill, Will and Chang, Yapei and Gu, Yuling and Zeng, Zhiyuan and Sabharwal, Ashish and Zettlemoyer, Luke and Koh, Pang Wei and Farhadi, Ali and Smith, Noah A. and Hajishirzi, Hannaneh},
  year = 2025,
  eprint = {2512.13961},
  primaryclass = {cs.CL},
  archiveprefix = {arXiv}
}

@incollection{omohundroBasicAIDrives2018,
  title = {The {{Basic AI Drives}}},
  booktitle = {Artificial {{Intelligence Safety}} and {{Security}}},
  author = {Omohundro, Stephen M.},
  editor = {Yampolskiy, Roman V.},
  year = 2018,
  month = jul,
  edition = {1},
  pages = {47--55},
  publisher = {{Chapman and Hall/CRC}},
  address = {Boca Raton, FL},
  doi = {10.1201/9781351251389-3},
  urldate = {2026-01-19},
  abstract = {One might imagine that AI systems with harmless goals will be harmless. This paper instead shows that intelligent systems will need to be carefully designed to prevent them from behaving in harmful ways. We identify a number of ``drives'' that will appear in sufficiently advanced AI systems of any design. We call them drives because they are tendencies which will be present unless explicitly counteracted. We start by showing that goal-seeking systems will have drives to model their own operation and to improve themselves. We then show that self-improving systems will be driven to clarify their goals and represent them as economic utility functions. They will also strive for their actions to approximate rational economic behavior. This will lead almost all systems to protect their utility functions from modification and their utility measurement systems from corruption. We also discuss some exceptional systems which will want to modify their utility functions. We next discuss the drive toward self-protection which causes systems try to prevent themselves from being harmed. Finally we examine drives toward the acquisition of resources and toward their efficient utilization. We end with a discussion of how to incorporate these insights in designing intelligent technology which will lead to a positive future for humanity.},
  isbn = {978-1-351-25138-9},
  langid = {english}
}

@misc{OpenSourceAI,
  title = {The {{Open Source AI Definition}} -- 1.0},
  author = {{Open Source Initiative}},
  url = {https://opensource.org/ai/open-source-ai-definition},
  urldate = {2026-01-25},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/NNFYWP36/open-source-ai-definition.html}
}

@book{ostromGoverningCommonsEvolution1990,
  author = {Ostrom, Elinor},
  title = {Governing the {{Commons}}: {{The Evolution}} of {{Institutions}} for {{Collective Action}}},
  shorttitle = {Governing the {{Commons}}},
  year = {1990},
  month = nov,
  publisher = {Cambridge University Press},
  abstract = {Congratulations to Elinor Ostrom, Co-Winner of The Sveriges Riksbank Prize in Economic Sciences in Memory of Alfred Nobel 2009! The governance of natural resources used by many individuals in common is an issue of increasing concern to policy analysts. Both state control and privatization of resources have been advocated, but neither the state nor the market have been uniformly successful in solving common pool resource problems. After critiquing the foundations of policy analysis as applied to natural resources, Elinor Ostrom here provides a unique body of empirical data to explore conditions under which common pool resource problems have been satisfactorily or unsatisfactorily solved. Dr. Ostrom first describes three models most frequently used as the foundation for recommending state or market solutions. She then outlines theoretical and empirical alternatives to these models in order to illustrate the diversity of possible solutions. In the following chapters she uses institutional analysis to examine different ways--both successful and unsuccessful--of governing the commons. In contrast to the proposition of the tragedy of the commons argument, common pool problems sometimes are solved by voluntary organizations rather than by a coercive state. Among the cases considered are communal tenure in meadows and forests, irrigation communities and other water rights, and fisheries.},
  googlebooks = {4xg6oUobMz4C},
  isbn = {978-0-521-40599-7},
  langid = {english},
  keywords = {Business & Economics / Real Estate / General,Law / Land Use,Political Science / General,Political Science / Public Policy / Economic Policy,Social Science / General}
}

@misc{ouyangTrainingLanguageModels2022,
  author = {Ouyang, Long and Wu, Jeff and Jiang, Xu and Almeida, Diogo and Wainwright, Carroll L. and Mishkin, Pamela and Zhang, Chong and Agarwal, Sandhini and Slama, Katarina and Ray, Alex and Schulman, John and Hilton, Jacob and Kelton, Fraser and Miller, Luke and Simens, Maddie and Askell, Amanda and Welinder, Peter and Christiano, Paul and Leike, Jan and Lowe, Ryan},
  title = {Training Language Models to Follow Instructions with Human Feedback},
  year = {2022},
  month = mar,
  number = {arXiv:2203.02155},
  eprint = {2203.02155},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2203.02155},
  urldate = {2025-10-25},
  abstract = {Making language models bigger does not inherently make them better at following a user's intent. For example, large language models can generate outputs that are untruthful, toxic, or simply not helpful to the user. In other words, these models are not aligned with their users. In this paper, we show an avenue for aligning language models with user intent on a wide range of tasks by fine-tuning with human feedback. Starting with a set of labeler-written prompts and prompts submitted through the OpenAI API, we collect a dataset of labeler demonstrations of the desired model behavior, which we use to fine-tune GPT-3 using supervised learning. We then collect a dataset of rankings of model outputs, which we use to further fine-tune this supervised model using reinforcement learning from human feedback. We call the resulting models InstructGPT. In human evaluations on our prompt distribution, outputs from the 1.3B parameter InstructGPT model are preferred to outputs from the 175B GPT-3, despite having 100x fewer parameters. Moreover, InstructGPT models show improvements in truthfulness and reductions in toxic output generation while having minimal performance regressions on public NLP datasets. Even though InstructGPT still makes simple mistakes, our results show that fine-tuning with human feedback is a promising direction for aligning language models with human intent.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/B8EJITGI/Ouyang et al. - 2022 - Training language models to follow instructions with human feedback.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/JQ8EDTAB/2203.html}
}

% arXiv position paper presenting the "Democracy Levels" framework for democratic AI.
@misc{ovadyaDemocraticAIPossible2025,
  author        = {Ovadya, Aviv and Redman, Kyle and Thorburn, Luke and Chen, Quan Ze and Smith, Oliver and Devine, Flynn and Konya, Andrew and Milli, Smitha and Revel, Manon and Feng, K. J. Kevin and Zhang, Amy X. and Chandra, Bilva and Bakker, Michiel A. and Kasirzadeh, Atoosa},
  title         = {Democratic {{AI}} Is {{Possible}}. {{The Democracy Levels Framework Shows How It Might Work}}},
  year          = 2025,
  month         = aug,
  publisher     = {arXiv},
  number        = {arXiv:2411.09222},
  eprint        = {2411.09222},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  doi           = {10.48550/arXiv.2411.09222},
  urldate       = {2026-01-04},
  keywords      = {Computer Science - Computers and Society},
  abstract      = {This position paper argues that effectively "democratizing AI" requires democratic governance and alignment of AI, and that this is particularly valuable for decisions with systemic societal impacts. Initial steps -- such as Meta's Community Forums and Anthropic's Collective Constitutional AI -- have illustrated a promising direction, where democratic processes could be used to meaningfully improve public involvement and trust in critical decisions. To more concretely explore what increasingly democratic AI might look like, we provide a "Democracy Levels" framework and associated tools that: (i) define milestones toward meaningfully democratic AI, which is also crucial for substantively pluralistic, human-centered, participatory, and public-interest AI, (ii) can help guide organizations seeking to increase the legitimacy of their decisions on difficult AI governance and alignment questions, and (iii) support the evaluation of such efforts.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/9JW6IMF2/Ovadya et al. - 2025 - Democratic AI is Possible. The Democracy Levels Framework Shows How It Might Work.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/NZU3UU3X/2411.html}
}

% RNZ news article. The scraped byline "Digital Political Journalist" is a job
% title rather than a co-author, so only the named author is listed.
@article{palmerElection2023RNZs2023,
  title = {Election 2023: {{RNZ}}'s Guide to Party Policy},
  shorttitle = {Election 2023},
  author = {Palmer, Russell},
  year = 2023,
  month = aug,
  journal = {RNZ},
  urldate = {2026-01-28},
  abstract = {Promises, promises: it's easy to forget which party has pledged what. Welcome to RNZ's go-to guide for party policy ahead of the 14 October election day. We'll keep updating this guide as policies are rolled out - but some may be delayed, or too vague to be worth mentioning.},
  langid = {newzealand},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/K5RTPVNJ/election-2023-rnz-s-guide-to-party-policy.html}
}

% Conference (ICML 2023) version of the MACHIAVELLI paper; the arXiv preprint is
% the separate entry panRewardsJustifyMeans2023. A conference paper belongs in
% an inproceedings entry with a booktitle, not in an article with a journal
% field. The trailing period is dropped and the benchmark name is protected.
@inproceedings{pan2023machiavelli,
  title = {Do the {{Rewards Justify}} the {{Means}}? {{Measuring Trade-Offs Between Rewards}} and {{Ethical Behavior}} in the {{MACHIAVELLI Benchmark}}},
  author = {Pan, Alexander and Chan, Jun Shern and Zou, Andy and Li, Nathaniel and Basart, Steven and Woodside, Thomas and Ng, Jonathan and Zhang, Hanlin and Emmons, Scott and Hendrycks, Dan},
  year = 2023,
  booktitle = {Proceedings of the 40th International Conference on Machine Learning},
  series = {Proceedings of Machine Learning Research},
  volume = {202},
  publisher = {PMLR}
}

% arXiv preprint introducing the MACHIAVELLI benchmark.
@misc{panRewardsJustifyMeans2023,
  author        = {Pan, Alexander and Chan, Jun Shern and Zou, Andy and Li, Nathaniel and Basart, Steven and Woodside, Thomas and Ng, Jonathan and Zhang, Hanlin and Emmons, Scott and Hendrycks, Dan},
  title         = {Do the {{Rewards Justify}} the {{Means}}? {{Measuring Trade-Offs Between Rewards}} and {{Ethical Behavior}} in the {{MACHIAVELLI Benchmark}}},
  shorttitle    = {Do the {{Rewards Justify}} the {{Means}}?},
  year          = 2023,
  month         = jun,
  publisher     = {arXiv},
  number        = {arXiv:2304.03279},
  eprint        = {2304.03279},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  doi           = {10.48550/arXiv.2304.03279},
  urldate       = {2026-01-26},
  keywords      = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Computers and Society,Computer Science - Machine Learning},
  abstract      = {Artificial agents have traditionally been trained to maximize reward, which may incentivize power-seeking and deception, analogous to how next-token prediction in language models (LMs) may incentivize toxicity. So do agents naturally learn to be Machiavellian? And how do we measure these behaviors in general-purpose models such as GPT-4? Towards answering these questions, we introduce MACHIAVELLI, a benchmark of 134 Choose-Your-Own-Adventure games containing over half a million rich, diverse scenarios that center on social decision-making. Scenario labeling is automated with LMs, which are more performant than human annotators. We mathematize dozens of harmful behaviors and use our annotations to evaluate agents' tendencies to be power-seeking, cause disutility, and commit ethical violations. We observe some tension between maximizing reward and behaving ethically. To improve this trade-off, we investigate LM-based methods to steer agents' towards less harmful behaviors. Our results show that agents can both act competently and morally, so concrete progress can currently be made in machine ethics--designing agents that are Pareto improvements in both safety and capabilities.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/C427JFVQ/Pan et al. - 2023 - Do the Rewards Justify the Means Measuring Trade-Offs Between Rewards and Ethical Behavior in the M.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/TR9UQKZ5/2304.html}
}

% arXiv preprint on generative agents (LLM-backed simulations of human behavior).
@misc{parkGenerativeAgentsInteractive2023,
  author        = {Park, Joon Sung and O'Brien, Joseph C. and Cai, Carrie J. and Morris, Meredith Ringel and Liang, Percy and Bernstein, Michael S.},
  title         = {Generative {{Agents}}: {{Interactive Simulacra}} of {{Human Behavior}}},
  shorttitle    = {Generative {{Agents}}},
  year          = 2023,
  month         = aug,
  publisher     = {arXiv},
  number        = {arXiv:2304.03442},
  eprint        = {2304.03442},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  doi           = {10.48550/arXiv.2304.03442},
  urldate       = {2026-02-03},
  keywords      = {Computer Science - Artificial Intelligence,Computer Science - Human-Computer Interaction,Computer Science - Machine Learning},
  abstract      = {Believable proxies of human behavior can empower interactive applications ranging from immersive environments to rehearsal spaces for interpersonal communication to prototyping tools. In this paper, we introduce generative agents--computational software agents that simulate believable human behavior. Generative agents wake up, cook breakfast, and head to work; artists paint, while authors write; they form opinions, notice each other, and initiate conversations; they remember and reflect on days past as they plan the next day. To enable generative agents, we describe an architecture that extends a large language model to store a complete record of the agent's experiences using natural language, synthesize those memories over time into higher-level reflections, and retrieve them dynamically to plan behavior. We instantiate generative agents to populate an interactive sandbox environment inspired by The Sims, where end users can interact with a small town of twenty five agents using natural language. In an evaluation, these generative agents produce believable individual and emergent social behaviors: for example, starting with only a single user-specified notion that one agent wants to throw a Valentine's Day party, the agents autonomously spread invitations to the party over the next two days, make new acquaintances, ask each other out on dates to the party, and coordinate to show up for the party together at the right time. We demonstrate through ablation that the components of our agent architecture--observation, planning, and reflection--each contribute critically to the believability of agent behavior. By fusing large language models with computational, interactive agents, this work introduces architectural and interaction patterns for enabling believable simulations of human behavior.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/ZB98BUCP/Park et al. - 2023 - Generative Agents Interactive Simulacra of Human Behavior.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/TW9K3ZEN/2304.html}
}

% Web page: the PauseAI proposal (locator stored in howpublished).
@misc{PauseAIProposal,
  title        = {{{PauseAI Proposal}}},
  journal      = {PauseAI},
  howpublished = {https://pauseai.info/proposal},
  urldate      = {2025-12-24},
  langid       = {english},
  abstract     = {Implement a temporary pause on the training of the most powerful general AI systems.},
  file         = {/home/james/snap/zotero-snap/common/Zotero/storage/IMFLSSYN/proposal.html}
}

% Future of Life Institute open letter. The export carried an access date but
% no locator, author, or year, so the reference could not be resolved from the
% bibliography. These are supplied here, with the URL in howpublished to match
% the PauseAIProposal web entry in this file.
@misc{PauseGiantAI,
  title = {Pause {{Giant AI Experiments}}: {{An Open Letter}}},
  shorttitle = {Pause {{Giant AI Experiments}}},
  author = {{Future of Life Institute}},
  year = 2023,
  month = mar,
  journal = {Future of Life Institute},
  urldate = {2025-12-24},
  abstract = {We call on all AI labs to immediately pause for at least 6 months the training of AI systems more powerful than GPT-4.},
  howpublished = {https://futureoflife.org/open-letter/pause-giant-ai-experiments/},
  langid = {american},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/J9FMKFSE/pause-giant-ai-experiments.html}
}

% arXiv preprint on model-written evaluations. The surname "El Showk" had been
% split so that "El" was treated as a given name; it is restored to the
% family-name part so the author sorts and abbreviates correctly.
@misc{perezDiscoveringLanguageModel2022b,
  title = {Discovering {{Language Model Behaviors}} with {{Model-Written Evaluations}}},
  author = {Perez, Ethan and Ringer, Sam and Luko{\v s}i{\=u}t{\.e}, Kamil{\.e} and Nguyen, Karina and Chen, Edwin and Heiner, Scott and Pettit, Craig and Olsson, Catherine and Kundu, Sandipan and Kadavath, Saurav and Jones, Andy and Chen, Anna and Mann, Ben and Israel, Brian and Seethor, Bryan and McKinnon, Cameron and Olah, Christopher and Yan, Da and Amodei, Daniela and Amodei, Dario and Drain, Dawn and Li, Dustin and {Tran-Johnson}, Eli and Khundadze, Guro and Kernion, Jackson and Landis, James and Kerr, Jamie and Mueller, Jared and Hyun, Jeeyoon and Landau, Joshua and Ndousse, Kamal and Goldberg, Landon and Lovitt, Liane and Lucas, Martin and Sellitto, Michael and Zhang, Miranda and Kingsland, Neerav and Elhage, Nelson and Joseph, Nicholas and Mercado, Noem{\'i} and DasSarma, Nova and Rausch, Oliver and Larson, Robin and McCandlish, Sam and Johnston, Scott and Kravec, Shauna and El Showk, Sheer and Lanham, Tamera and {Telleen-Lawton}, Timothy and Brown, Tom and Henighan, Tom and Hume, Tristan and Bai, Yuntao and {Hatfield-Dodds}, Zac and Clark, Jack and Bowman, Samuel R. and Askell, Amanda and Grosse, Roger and Hernandez, Danny and Ganguli, Deep and Hubinger, Evan and Schiefer, Nicholas and Kaplan, Jared},
  year = 2022,
  month = dec,
  number = {arXiv:2212.09251},
  eprint = {2212.09251},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2212.09251},
  urldate = {2025-11-25},
  abstract = {As language models (LMs) scale, they develop many novel behaviors, good and bad, exacerbating the need to evaluate how they behave. Prior work creates evaluations with crowdwork (which is time-consuming and expensive) or existing data sources (which are not always available). Here, we automatically generate evaluations with LMs. We explore approaches with varying amounts of human effort, from instructing LMs to write yes/no questions to making complex Winogender schemas with multiple stages of LM-based generation and filtering. Crowdworkers rate the examples as highly relevant and agree with 90-100\% of labels, sometimes more so than corresponding human-written datasets. We generate 154 datasets and discover new cases of inverse scaling where LMs get worse with size. Larger LMs repeat back a dialog user's preferred answer ("sycophancy") and express greater desire to pursue concerning goals like resource acquisition and goal preservation. We also find some of the first examples of inverse scaling in RL from Human Feedback (RLHF), where more RLHF makes LMs worse. For example, RLHF makes LMs express stronger political views (on gun rights and immigration) and a greater desire to avoid shut down. Overall, LM-written evaluations are high-quality and let us quickly discover many novel LM behaviors.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/SKN5J2W4/Perez et al. - 2022 - Discovering Language Model Behaviors with Model-Written Evaluations.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/AIQJZL63/2212.html}
}

% arXiv preprint. The benchmark name is brace-protected so sentence-casing
% styles keep "Last Exam" capitalised, and the number/publisher/doi fields are
% added to match the other arXiv entries in this file.
% NOTE(review): the author list is kept exactly as exported; it contains
% single-token names and apparently mis-split particles that should be checked
% against the arXiv record before publication.
@misc{phan2025humanitysexam,
  title = {Humanity's {{Last Exam}}},
  author = {Phan, Long and Gatti, Alice and Han, Ziwen and Li, Nathaniel and Hu, Josephina and Zhang, Hugh and Zhang, Chen Bo Calvin and Shaaban, Mohamed and Ling, John and Shi, Sean and Choi, Michael and Agrawal, Anish and Chopra, Arnav and Khoja, Adam and Kim, Ryan and Ren, Richard and Hausenloy, Jason and Zhang, Oliver and Mazeika, Mantas and Dodonov, Dmitry and Nguyen, Tung and Lee, Jaeho and Anderson, Daron and Doroshenko, Mikhail and Stokes, Alun Cennyth and Mahmood, Mobeen and Pokutnyi, Oleksandr and Iskra, Oleg and Wang, Jessica P. and Levin, John-Clark and Kazakov, Mstyslav and Feng, Fiona and Feng, Steven Y. and Zhao, Haoran and Yu, Michael and Gangal, Varun and Zou, Chelsea and Wang, Zihan and Popov, Serguei and Gerbicz, Robert and Galgon, Geoff and Schmitt, Johannes and Yeadon, Will and Lee, Yongki and Sauers, Scott and Sanchez, Alvaro and Giska, Fabian and Roth, Marc and Riis, S{\o}ren and Utpala, Saiteja and Burns, Noah and Goshu, Gashaw M. and Naiya, Mohinder Maheshbhai and Agu, Chidozie and Giboney, Zachary and Cheatom, Antrell and {Fournier-Facio}, Francesco and Crowson, Sarah-Jane and Finke, Lennart and Cheng, Zerui and Zampese, Jennifer and Hoerr, Ryan G. and Nandor, Mark and Park, Hyunwoo and Gehrunger, Tim and Cai, Jiaqi and McCarty, Ben and Garretson, Alexis C and Taylor, Edwin and Sileo, Damien and Ren, Qiuyu and Qazi, Usman and Li, Lianghui and Nam, Jungbae and Wydallis, John B. and Arkhipov, Pavel and Shi, Jack Wei Lun and Bacho, Aras and Willcocks, Chris G. 
and Cao, Hangrui and Motwani, Sumeet and {de Oliveira Santos}, Emily and Veith, Johannes and Vendrow, Edward and Cojoc, Doru and Zenitani, Kengo and Robinson, Joshua and Tang, Longke and Li, Yuqi and Vendrow, Joshua and Fraga, Natanael Wildner and Kuchkin, Vladyslav and Maksimov, Andrey Pupasov and Marion, Pierre and Efremov, Denis and Lynch, Jayson and Liang, Kaiqu and Mikov, Aleksandar and Gritsevskiy, Andrew and Guillod, Julien and Demir, G{\"o}zdenur and Martinez, Dakotah and Pageler, Ben and Zhou, Kevin and Soori, Saeed and Press, Ori and Tang, Henry and Rissone, Paolo and Green, Sean R. and Br{\"u}ssel, Lina and Twayana, Moon and Dieuleveut, Aymeric and Imperial, Joseph Marvin and Prabhu, Ameya and Yang, Jinzhou and Crispino, Nick and Rao, Arun and Zvonkine, Dimitri and Loiseau, Gabriel and Kalinin, Mikhail and Lukas, Marco and Manolescu, Ciprian and Stambaugh, Nate and Mishra, Subrata and Hogg, Tad and Bosio, Carlo and Coppola, Brian P and Salazar, Julian and Jin, Jaehyeok and Sayous, Rafael and Ivanov, Stefan and Schwaller, Philippe and Senthilkuma, Shaipranesh and Bran, Andres M and Algaba, Andres and {den Houte}, Kelsey Van and Sypt, Lynn Van Der and Verbeken, Brecht and Noever, David and Kopylov, Alexei and Myklebust, Benjamin and Li, Bikun and Schut, Lisa and Zheltonozhskii, Evgenii and Yuan, Qiaochu and Lim, Derek and Stanley, Richard and Yang, Tong and Maar, John and Wykowski, Julian and Oller, Mart{\'i} and Sahu, Anmol and Ardito, Cesare Giulio and Hu, Yuzheng and Kamdoum, Ariel Ghislain Kemogne and Jin, Alvin and Vilchis, Tobias Garcia and Zu, Yuexuan and Lackner, Martin and Koppel, James and Sun, Gongbo and Antonenko, Daniil S. 
and Chern, Steffi and Zhao, Bingchen and Arsene, Pierrot and Cavanagh, Joseph M and Li, Daofeng and Shen, Jiawei and Crisostomi, Donato and Zhang, Wenjin and Dehghan, Ali and Ivanov, Sergey and Perrella, David and Kaparov, Nurdin and Zang, Allen and Sucholutsky, Ilia and Kharlamova, Arina and Orel, Daniil and Poritski, Vladislav and {Ben-David}, Shalev and Berger, Zachary and Whitfill, Parker and Foster, Michael and Munro, Daniel and Ho, Linh and Sivarajan, Shankar and Hava, Dan Bar and Kuchkin, Aleksey and Holmes, David and {Rodriguez-Romero}, Alexandra and Sommerhage, Frank and Zhang, Anji and Moat, Richard and Schneider, Keith and Kazibwe, Zakayo and Clarke, Don and Kim, Dae Hyun and Dias, Felipe Meneguitti and Fish, Sara and Elser, Veit and Kreiman, Tobias and Vilchis, Victor Efren Guadarrama and Klose, Immo and Anantheswaran, Ujjwala and Zweiger, Adam and Rawal, Kaivalya and Li, Jeffery and Nguyen, Jeremy and Daans, Nicolas and Heidinger, Haline and Radionov, Maksim and Rozho{\v n}, V{\'a}clav and Ginis, Vincent and Stump, Christian and Cohen, Niv and Po{\'s}wiata, Rafa{\l} and Tkadlec, Josef and Goldfarb, Alan and Wang, Chenguang and Padlewski, Piotr and Barzowski, Stanislaw and Montgomery, Kyle and Stendall, Ryan and {Tucker-Foltz}, Jamie and Stade, Jack and Rogers, T. Ryan and Goertzen, Tom and Grabb, Declan and Shukla, Abhishek and Givr{\'e}, Alan and Ambay, John Arnold and Sen, Archan and Aziz, Muhammad Fayez and Inlow, Mark H and He, Hao and Zhang, Ling and Kaddar, Younesse and {\"A}ngquist, Ivar and Chen, Yanxu and Wang, Harrison K and Ramakrishnan, Kalyan and Thornley, Elliott and Terpin, Antonio and Schoelkopf, Hailey and Zheng, Eric and Carmi, Avishy and Brown, Ethan D. L. and Zhu, Kelin and Bartolo, Max and Wheeler, Richard and Stehberger, Martin and Bradshaw, Peter and Heimonen, {\relax JP} and Sridhar, Kaustubh and Akov, Ido and Sandlin, Jennifer and Makarychev, Yury and Tam, Joanna and Hoang, Hieu and Cunningham, David M. 
and Goryachev, Vladimir and Patramanis, Demosthenes and Krause, Michael and Redenti, Andrew and Aldous, David and Lai, Jesyin and Coleman, Shannon and Xu, Jiangnan and Lee, Sangwon and Magoulas, Ilias and Zhao, Sandy and Tang, Ning and Cohen, Michael K. and Paradise, Orr and Kirchner, Jan Hendrik and Ovchynnikov, Maksym and Matos, Jason O. and Shenoy, Adithya and Wang, Michael and Nie, Yuzhou and {Sztyber-Betley}, Anna and Faraboschi, Paolo and Riblet, Robin and Crozier, Jonathan and Halasyamani, Shiv and Verma, Shreyas and Joshi, Prashant and Meril, Eli and Ma, Ziqiao and Andr{\'e}oletti, J{\'e}r{\'e}my and Singhal, Raghav and Platnick, Jacob and Nevirkovets, Volodymyr and Basler, Luke and Ivanov, Alexander and Khoury, Seri and Gustafsson, Nils and Piccardo, Marco and Mostaghimi, Hamid and Chen, Qijia and Singh, Virendra and Kh{\'a}nh, Tran Quoc and Rosu, Paul and Szlyk, Hannah and Brown, Zachary and Narayan, Himanshu and Menezes, Aline and Roberts, Jonathan and Alley, William and Sun, Kunyang and Patel, Arkil and Lamparth, Max and Reuel, Anka and Xin, Linwei and Xu, Hanmeng and Loader, Jacob and Martin, Freddie and Wang, Zixuan and Achilleos, Andrea and Preu, Thomas and Korbak, Tomek and Bosio, Ida and Kazemi, Fereshteh and Chen, Ziye and B{\'a}lint, Bir{\'o} and Lo, Eve J. Y. and Wang, Jiaqi and Nunes, Maria In{\^e}s S. and Milbauer, Jeremiah and Bari, M Saiful and Wang, Zihao and Ansarinejad, Behzad and Sun, Yewen and Durand, Stephane and Elgnainy, Hossam and Douville, Guillaume and Tordera, Daniel and Balabanian, George and Wolff, Hew and Kvistad, Lynna and Milliron, Hsiaoyun and Sakor, Ahmad and Eron, Murat and O., Andrew Favre D. and Shah, Shailesh and Zhou, Xiaoxiang and Kamalov, Firuz and Abdoli, Sherwin and Santens, Tim and Barkan, Shaul and Tee, Allison and Zhang, Robin and Tomasiello, Alessandro and Luca, G. 
Bruno De and Looi, Shi-Zhuo and Le, Vinh-Kha and Kolt, Noam and Pan, Jiayi and Rodman, Emma and Drori, Jacob and Fossum, Carl J and Muennighoff, Niklas and Jagota, Milind and Pradeep, Ronak and Fan, Honglu and Eicher, Jonathan and Chen, Michael and Thaman, Kushal and Merrill, William and Firsching, Moritz and Harris, Carter and Ciob{\^a}c{\u a}, Stefan and Gross, Jason and Pandey, Rohan and Gusev, Ilya and Jones, Adam and Agnihotri, Shashank and Zhelnov, Pavel and Mofayezi, Mohammadreza and Piperski, Alexander and Zhang, David K. and Dobarskyi, Kostiantyn and Leventov, Roman and Soroko, Ignat and Duersch, Joshua and Taamazyan, Vage and Ho, Andrew and Ma, Wenjie and Held, William and Xian, Ruicheng and Zebaze, Armel Randy and Mohamed, Mohanad and Leser, Julian Noah and Yuan, Michelle X and Yacar, Laila and Lengler, Johannes and Olszewska, Katarzyna and Fratta, Claudio Di and Oliveira, Edson and Jackson, Joseph W. and Zou, Andy and Chidambaram, Muthu and Manik, Timothy and Haffenden, Hector and Stander, Dashiell and Dasouqi, Ali and Shen, Alexander and Golshani, Bita and Stap, David and Kretov, Egor and Uzhou, Mikalai and Zhidkovskaya, Alina Borisovna and Winter, Nick and Rodriguez, Miguel Orbegozo and Lauff, Robert and Wehr, Dustin and Tang, Colin and Hossain, Zaki and Phillips, Shaun and Samuele, Fortuna and Ekstr{\"o}m, Fredrik and Hammon, Angela and Patel, Oam and Farhidi, Faraz and Medley, George and Mohammadzadeh, Forough and Pe{\~n}aflor, Madellene and Kassahun, Haile and Friedrich, Alena and Perez, Rayner Hernandez and Pyda, Daniel and Sakal, Taom and Dhamane, Omkar and Mirabadi, Ali Khajegili and Hallman, Eric and Okutsu, Kenchi and Battaglia, Mike and Maghsoudimehrabani, Mohammad and Amit, Alon and Hulbert, Dave and Pereira, Roberto and Weber, Simon and {Handoko} and Peristyy, Anton and Malina, Stephen and Mehkary, Mustafa and Aly, Rami and Reidegeld, Frank and Dick, Anna-Katharina and Friday, Cary and Singh, Mukhwinder and Shapourian, Hassan and Kim, 
Wanyoung and Costa, Mariana and Gurdogan, Hubeyb and Kumar, Harsh and Ceconello, Chiara and Zhuang, Chao and Park, Haon and Carroll, Micah and Tawfeek, Andrew R. and Steinerberger, Stefan and Aggarwal, Daattavya and Kirchhof, Michael and Dai, Linjie and Kim, Evan and Ferret, Johan and Shah, Jainam and Wang, Yuzhou and Yan, Minghao and Burdzy, Krzysztof and Zhang, Lixin and Franca, Antonio and Pham, Diana T. and Loh, Kang Yong and Robinson, Joshua and Jackson, Abram and Giordano, Paolo and Petersen, Philipp and Cosma, Adrian and Colino, Jesus and White, Colin and Votava, Jacob and Vinnikov, Vladimir and Delaney, Ethan and Spelda, Petr and Stritecky, Vit and Shahid, Syed M. and Mourrat, Jean-Christophe and Vetoshkin, Lavr and Sponselee, Koen and Bacho, Renas and Yong, Zheng-Xin and {de la Rosa}, Florencia and Cho, Nathan and Li, Xiuyu and Malod, Guillaume and Weller, Orion and Albani, Guglielmo and Lang, Leon and Laurendeau, Julien and Kazakov, Dmitry and Adesanya, Fatimah and Portier, Julien and Hollom, Lawrence and Souza, Victor and Zhou, Yuchen Anna and Degorre, Julien and Yal{\i}n, Yi{\u g}it and Obikoya, Gbenga Daniel and {Rai} and Bigi, Filippo and Bosc{\'a}, M. C. and Shumar, Oleg and Bacho, Kaniuar and Recchia, Gabriel and Popescu, Mara and Shulga, Nikita and Tanwie, Ngefor Mildred and Lux, Thomas C. H. and Rank, Ben and Ni, Colin and Brooks, Matthew and Yakimchyk, Alesia and {Huanxu} and {Liu} and Cavalleri, Stefano and H{\"a}ggstr{\"o}m, Olle and Verkama, Emil and Newbould, Joshua and Gundlach, Hans and {Brito-Santana}, Leonor and Amaro, Brian and Vajipey, Vivek and Grover, Rynaa and Wang, Ting and Kratish, Yosi and Li, Wen-Ding and Gopi, Sivakanth and Caciolai, Andrea and {de Witt}, Christian Schroeder and {Hern{\'a}ndez-C{\'a}mara}, Pablo and Rodol{\`a}, Emanuele and Robins, Jules and Williamson, Dominic and Cheng, Vincent and Raynor, Brad and Qi, Hao and Segev, Ben and Fan, Jingxuan and Martinson, Sarah and Wang, Erik Y. 
and Hausknecht, Kaylie and Brenner, Michael P. and Mao, Mao and Demian, Christoph and Kassani, Peyman and Zhang, Xinyu and Avagian, David and Scipio, Eshawn Jessica and Ragoler, Alon and Tan, Justin and Sims, Blake and Plecnik, Rebeka and Kirtland, Aaron and Bodur, Omer Faruk and Shinde, D. P. and Labrador, Yan Carlos Leyva and Adoul, Zahra and Zekry, Mohamed and Karakoc, Ali and Santos, Tania C. B. and Shamseldeen, Samir and Karim, Loukmane and Liakhovitskaia, Anna and Resman, Nate and Farina, Nicholas and Gonzalez, Juan Carlos and Maayan, Gabe and Anderson, Earth and Pena, Rodrigo De Oliveira and Kelley, Elizabeth and Mariji, Hodjat and Pouriamanesh, Rasoul and Wu, Wentao and Finocchio, Ross and Alarab, Ismail and Cole, Joshua and Ferreira, Danyelle and Johnson, Bryan and Safdari, Mohammad and Dai, Liangti and Arthornthurasuk, Siriphan and McAlister, Isaac C. and Moyano, Alejandro Jos{\'e} and Pronin, Alexey and Fan, Jing and {Ramirez-Trinidad}, Angel and Malysheva, Yana and Pottmaier, Daphiny and Taheri, Omid and Stepanic, Stanley and Perry, Samuel and Askew, Luke and Rodr{\'i}guez, Ra{\'u}l Adri{\'a}n Huerta and Minissi, Ali M. R. and Lorena, Ricardo and Iyer, Krishnamurthy and Fasiludeen, Arshad Anil and Clark, Ronald and Ducey, Josh and Piza, Matheus and Somrak, Maja and Vergo, Eric and Qin, Juehang and Borb{\'a}s, Benj{\'a}min and Chu, Eric and Lindsey, Jack and Jallon, Antoine and McInnis, I. M. J. and Chen, Evan and Semler, Avi and Gloor, Luk and Shah, Tej and Carauleanu, Marc and Lauer, Pascal and Huy, Tran {\DJ}uc and Shahrtash, Hossein and Duc, Emilien and Lewark, Lukas and Brown, Assaf and Albanie, Samuel and Weber, Brian and Vaz, Warren S. 
and Clavier, Pierre and Fan, Yiyang and {e Silva}, Gabriel Poesia Reis and {Long} and {Lian} and Abramovitch, Marcus and Jiang, Xi and Mendoza, Sandra and Islam, Murat and Gonzalez, Juan and Mavroudis, Vasilios and Xu, Justin and Kumar, Pawan and Goswami, Laxman Prasad and Bugas, Daniel and Heydari, Nasser and Jeanplong, Ferenc and Jansen, Thorben and Pinto, Antonella and Apronti, Archimedes and Galal, Abdallah and {Ze-An}, Ng and Singh, Ankit and Jiang, Tong and {of Arc Xavier}, Joan and Agarwal, Kanu Priya and Berkani, Mohammed and Zhang, Gang and Du, Zhehang and {de Oliveira Junior}, Benedito Alves and Malishev, Dmitry and Remy, Nicolas and Hartman, Taylor D. and Tarver, Tim and Mensah, Stephen and Loume, Gautier Abou and Morak, Wiktor and Habibi, Farzad and Hoback, Sarah and Cai, Will and Gimenez, Javier and Montecillo, Roselynn Grace and {\L}ucki, Jakub and Campbell, Russell and Sharma, Asankhaya and Meer, Khalida and Gul, Shreen and Gonzalez, Daniel Espinosa and Alapont, Xavier and Hoover, Alex and Chhablani, Gunjan and Vargus, Freddie and Agarwal, Arunim and Jiang, Yibo and Patil, Deepakkumar and Outevsky, David and Scaria, Kevin Joseph and Maheshwari, Rajat and Dendane, Abdelkader and Shukla, Priti and Cartwright, Ashley and Bogdanov, Sergei and M{\"u}ndler, Niels and M{\"o}ller, S{\"o}ren and Arnaboldi, Luca and Thaman, Kunvar and Siddiqi, Muhammad Rehan and Saxena, Prajvi and Gupta, Himanshu and Fruhauff, Tony and Sherman, Glen and Vincze, M{\'a}ty{\'a}s and Usawasutsakorn, Siranut and Ler, Dylan and Radhakrishnan, Anil and Enyekwe, Innocent and Salauddin, Sk Md and Muzhen, Jiang and Maksapetyan, Aleksandr and Rossbach, Vivien and Harjadi, Chris and Bahaloohoreh, Mohsen and Sparrow, Claire and Sidhu, Jasdeep and Ali, Sam and Bian, Song and Lai, John and Singer, Eric and Uro, Justine Leon and Bateman, Greg and Sayed, Mohamed and Menshawy, Ahmed and Duclosel, Darling and Bezzi, Dario and Jain, Yashaswini and Aaron, Ashley and Tiryakioglu, Murat and Siddh, 
Sheeshram and Krenek, Keith and Shah, Imad Ali and Jin, Jun and Creighton, Scott and Peskoff, Denis and {EL-Wasif}, Zienab and V, Ragavendran P and Richmond, Michael and McGowan, Joseph and Patwardhan, Tejal and Sun, Hao-Yu and Sun, Ting and Zubi{\'c}, Nikola and Sala, Samuele and Ebert, Stephen and Kaddour, Jean and Schottdorf, Manuel and Wang, Dianzhuo and Petruzella, Gerol and Meiburg, Alex and Medved, Tilen and ElSheikh, Ali and Hebbar, S Ashwin and Vaquero, Lorenzo and Yang, Xianjun and Poulos, Jason and Zouhar, Vil{\'e}m and Bogdanik, Sergey and Zhang, Mingfang and {Sanz-Ros}, Jorge and Anugraha, David and Dai, Yinwei and Nhu, Anh N. and Wang, Xue and Demircali, Ali Anil and Jia, Zhibai and Zhou, Yuyin and Wu, Juncheng and He, Mike and Chandok, Nitin and Sinha, Aarush and Luo, Gaoxiang and Le, Long and Noy{\'e}, Micka{\"e}l and Pere{\l}kiewicz, Micha{\l} and Pantidis, Ioannis and Qi, Tianbo and Purohit, Soham Sachin and Parcalabescu, Letitia and Nguyen, Thai-Hoa and Winata, Genta Indra and Ponti, Edoardo M. and Li, Hanchen and Dhole, Kaustubh and Park, Jongee and Abbondanza, Dario and Wang, Yuanli and Nayak, Anupam and Caetano, Diogo M. and Wong, Antonio A. W. L. and {del Rio-Chanona}, Maria and Kondor, D{\'a}niel and Francois, Pieter and Chalstrey, Ed and Zsambok, Jakob and Hoyer, Dan and Reddish, Jenny and Hauser, Jakob and {Rodrigo-Gin{\'e}s}, Francisco-Javier and Datta, Suchandra and Shepherd, Maxwell and Kamphuis, Thom and Zhang, Qizheng and Kim, Hyunjun and Sun, Ruiji and Yao, Jianzhu and Dernoncourt, Franck and Krishna, Satyapriya and Rismanchian, Sina and Pu, Bonan and Pinto, Francesco and Wang, Yingheng and Shridhar, Kumar and Overholt, Kalon J. and Briia, Glib and Nguyen, Hieu and {David} and Bartomeu, Soler and Pang, Tony CY and Wecker, Adam and Xiong, Yifan and Li, Fanfei and Huber, Lukas S. 
and Jaeger, Joshua and Maddalena, Romano De and L{\`u}, Xing Han and Zhang, Yuhui and Beger, Claas and Kon, Patrick Tser Jern and Li, Sean and Sanker, Vivek and Yin, Ming and Liang, Yihao and Zhang, Xinlu and Agrawal, Ankit and Yifei, Li S. and Zhang, Zechen and Cai, Mu and Sonmez, Yasin and Cozianu, Costin and Li, Changhao and Slen, Alex and Yu, Shoubin and Park, Hyun Kyu and Sarti, Gabriele and Bria{\'n}ski, Marcin and Stolfo, Alessandro and Nguyen, Truong An and Zhang, Mike and Perlitz, Yotam and {Hernandez-Orallo}, Jose and Li, Runjia and Shabani, Amin and {Juefei-Xu}, Felix and Dhingra, Shikhar and Zohar, Orr and Nguyen, My Chiffon and Pondaven, Alexander and Yilmaz, Abdurrahim and Zhao, Xuandong and Jin, Chuanyang and Jiang, Muyan and Todoran, Stefan and Han, Xinyao and Kreuer, Jules and Rabern, Brian and Plassart, Anna and Maggetti, Martino and Yap, Luther and Geirhos, Robert and Kean, Jonathon and Wang, Dingsu and Mollaei, Sina and Sun, Chenkai and Yin, Yifan and Wang, Shiqi and Li, Rui and Chang, Yaowen and Wei, Anjiang and Bizeul, Alice and Wang, Xiaohan and Arrais, Alexandre Oliveira and Mukherjee, Kushin and {Chamorro-Padial}, Jorge and Liu, Jiachen and Qu, Xingyu and Guan, Junyi and Bouyamourn, Adam and Wu, Shuyu and Plomecka, Martyna and Chen, Junda and Tang, Mengze and Deng, Jiaqi and Subramanian, Shreyas and Xi, Haocheng and Chen, Haoxuan and Zhang, Weizhi and Ren, Yinuo and Tu, Haoqin and Kim, Sejong and Chen, Yushun and Marjanovi{\'c}, Sara Vera and Ha, Junwoo and Luczyna, Grzegorz and Ma, Jeff J. and Shen, Zewen and Song, Dawn and Zhang, Cedegao E. and Wang, Zhun and Gendron, Ga{\"e}l and Xiao, Yunze and Smucker, Leo and Weng, Erica and Lee, Kwok Hao and Ye, Zhe and Ermon, Stefano and {Lopez-Miguel}, Ignacio D. and Knights, Theo and Gitter, Anthony and Park, Namkyu and Wei, Boyi and Chen, Hongzheng and Pai, Kunal and Elkhanany, Ahmed and Lin, Han and Siedler, Philipp D. 
and Fang, Jichao and Mishra, Ritwik and {Zsolnai-Feh{\'e}r}, K{\'a}roly and Jiang, Xilin and Khan, Shadab and Yuan, Jun and Jain, Rishab Kumar and Lin, Xi and Peterson, Mike and Wang, Zhe and Malusare, Aditya and Tang, Maosen and Gupta, Isha and Fosin, Ivan and Kang, Timothy and Dworakowska, Barbara and Matsumoto, Kazuki and Zheng, Guangyao and Sewuster, Gerben and Villanueva, Jorge Pretel and Rannev, Ivan and Chernyavsky, Igor and Chen, Jiale and Banik, Deepayan and Racz, Ben and Dong, Wenchao and Wang, Jianxin and Bashmal, Laila and Gon{\c c}alves, Duarte V. and Hu, Wei and Bar, Kaushik and Bohdal, Ondrej and Patlan, Atharv Singh and Dhuliawala, Shehzaad and Geirhos, Caroline and Wist, Julien and Kansal, Yuval and Chen, Bingsen and Tire, Kutay and Y{\"u}cel, Atak Talay and Christof, Brandon and Singla, Veerupaksh and Song, Zijian and Chen, Sanxing and Ge, Jiaxin and Ponkshe, Kaustubh and Park, Isaac and Shi, Tianneng and Ma, Martin Q. and Mak, Joshua and Lai, Sherwin and Moulin, Antoine and Cheng, Zhuo and Zhu, Zhanda and Zhang, Ziyi and Patil, Vaidehi and Jha, Ketan and Men, Qiutong and Wu, Jiaxuan and Zhang, Tianchi and Vieira, Bruno Hebling and Aji, Alham Fikri and Chung, Jae-Won and Mahfoud, Mohammed and Hoang, Ha Thi and Sperzel, Marc and Hao, Wei and Meding, Kristof and Xu, Sihan and Kostakos, Vassilis and Manini, Davide and Liu, Yueying and Toukmaji, Christopher and Paek, Jay and Yu, Eunmi and Demircali, Arif Engin and Sun, Zhiyi and Dewerpe, Ivan and Qin, Hongsen and Pflugfelder, Roman and Bailey, James and Morris, Johnathan and Heilala, Ville and Rosset, Sybille and Yu, Zishun and Chen, Peter E. 
and Yeo, Woongyeong and Jain, Eeshaan and Yang, Ryan and Chigurupati, Sreekar and Chernyavsky, Julia and Reddy, Sai Prajwal and Venugopalan, Subhashini and Batra, Hunar and Park, Core Francisco and Tran, Hieu and Maximiano, Guilherme and Zhang, Genghan and Liang, Yizhuo and Shiyu, Hu and Xu, Rongwu and Pan, Rui and Suresh, Siddharth and Liu, Ziqi and Gulati, Samaksh and Zhang, Songyang and Turchin, Peter and Bartlett, Christopher W. and Scotese, Christopher R. and Cao, Phuong M. and Wu, Ben and Karwowski, Jacek and Scaramuzza, Davide and Nattanmai, Aakaash and McKellips, Gordon and Cheraku, Anish and Suhail, Asim and Luo, Ethan and Deng, Marvin and Luo, Jason and Zhang, Ashley and Jindel, Kavin and Paek, Jay and Halevy, Kasper and Baranov, Allen and Liu, Michael and Avadhanam, Advaith and Zhang, David and Cheng, Vincent and Ma, Brad and Fu, Evan and Do, Liam and Lass, Joshua and Yang, Hubert and Sunkari, Surya and Bharath, Vishruth and Ai, Violet and Leung, James and Agrawal, Rishit and Zhou, Alan and Chen, Kevin and Kalpathi, Tejas and Xu, Ziqi and Wang, Gavin and Xiao, Tyler and Maung, Erik and Lee, Sam and Yang, Ryan and Yue, Roy and Zhao, Ben and Yoon, Julia and Sun, Sunny and Singh, Aryan and Luo, Ethan and Peng, Clark and Osbey, Tyler and Wang, Taozhi and Echeazu, Daryl and Yang, Hubert and Wu, Timothy and Patel, Spandan and Kulkarni, Vidhi and Sundarapandiyan, Vijaykaarti and Zhang, Ashley and Le, Andrew and Nasim, Zafir and Yalam, Srikar and Kasamsetty, Ritesh and Samal, Soham and Yang, Hubert and Sun, David and Shah, Nihar and Saha, Abhijeet and Zhang, Alex and Nguyen, Leon and Nagumalli, Laasya and Wang, Kaixin and Zhou, Alan and Wu, Aidan and Luo, Jason and Telluri, Anwith and Yue, Summer and Wang, Alexandr and Hendrycks, Dan},
  year = 2025,
  number = {arXiv:2501.14249},
  eprint = {2501.14249},
  primaryclass = {cs.LG},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2501.14249},
  archiveprefix = {arXiv}
}

% Gage (1994), the original byte-pair-encoding article.
% NOTE(review): the author was exported as one braced literal name; it is now a normal personal name.
% NOTE(review): volume/number/pages replace the junk issue "FEB94" and follow the commonly cited reference -- confirm against the source.
@article{philipgageNewAlgorithmData1994,
  title = {A {{New Algorithm}} for {{Data Compression}}},
  author = {Gage, Philip},
  year = 1994,
  month = feb,
  journal = {The C Users Journal},
  volume = {12},
  number = {2},
  pages = {23--38},
  urldate = {2025-12-06},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/4VQ8GBEC/19940045.html}
}

% Pinker, The Better Angels of Our Nature (2012 paperback).
% NOTE(review): the ISBN is taken from the attached snapshot's file name; the publisher is inferred from that ISBN -- confirm both.
@book{pinkerBetterAngelsOur2012,
  title = {The {{Better Angels}} of {{Our Nature}}},
  author = {Pinker, Steven},
  year = 2012,
  month = oct,
  publisher = {Penguin Books},
  urldate = {2026-01-27},
  abstract = {'The most inspiring book I've ever read' Bill Gates, 2017 'A brilliant, mind-altering book ... Everyone should read this astonishing book' Guardian 'Will change the way you see the world' Daily Mail Shortlisted for the Samuel Johnson Prize 2012 Wasn't the twentieth century the most violent in history? In his extraordinary, epic book Steven Pinker shows us that this is wrong, telling the story of humanity in a completely new and unfamiliar way. From why cities make us safer to how books bring about peace, Pinker weaves together history, philosophy and science to examine why we are less likely to die at another's hand than ever before, how it happened and what it tells us about our very natures. 'May prove to be one of the great books of our time ... he writes like an angel' Economist 'Masterly, a supremely important book ... For anyone interested in human nature, it is engrossing' The New York Times 'Marvellous ... riveting and myth-destroying' New Statesman 'A marvellous synthesis of science, history and storytelling, written in Pinker's distinctively entertaining and clear personal style ... I was astonished by the extent to which violence has declined in every shape, form and scale' Financial Times 'An outstandingly fruitful read, with fascinating nuggets on almost every page' Sunday Times, Books of the Year},
  isbn = {978-0-14-103464-5},
  langid = {english},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/HQEBU32H/9780141034645.html}
}

% Podsakoff et al. (2003): review of common method biases and recommended remedies.
@article{podsakoffCommonMethodBiases2003,
  author     = {Podsakoff, Philip M. and MacKenzie, Scott B. and Lee, Jeong-Yeon and Podsakoff, Nathan P.},
  title      = {Common Method Biases in Behavioral Research: A Critical Review of the Literature and Recommended Remedies},
  shorttitle = {Common Method Biases in Behavioral Research},
  journal    = {The Journal of Applied Psychology},
  year       = {2003},
  month      = oct,
  volume     = {88},
  number     = {5},
  pages      = {879--903},
  issn       = {0021-9010},
  doi        = {10.1037/0021-9010.88.5.879},
  pmid       = {14516251},
  langid     = {english},
  keywords   = {Behavior,Bias,Humans,Psychology Applied,Research Design,Statistics as Topic},
  abstract   = {Interest in the problem of method biases has a long history in the behavioral sciences. Despite this, a comprehensive summary of the potential sources of method biases and how to control for them does not exist. Therefore, the purpose of this article is to examine the extent to which method biases influence behavioral research results, identify potential sources of method biases, discuss the cognitive processes through which method biases influence responses to measures, evaluate the many different procedural and statistical techniques that can be used to control method biases, and provide recommendations for how to select appropriate procedural and statistical remedies for different types of research settings.},
}

% Public AI white paper (2024), archived on Zenodo; the author is an organisation, hence the double braces.
@misc{publicaiPublicAIWhite2024,
  author   = {{Public AI}},
  title    = {Public {{AI White Paper}}},
  year     = {2024},
  month    = aug,
  doi      = {10.5281/zenodo.13914560},
  urldate  = {2025-11-17},
  abstract = {In this paper, we set out a vision for a different path for AI.  It starts with a recognition that societies don't have to just consume the AI technologies shaping their lives---they can create them.  That's why we call for a new collective enterprise: building AI infrastructure for the common good. Public investments can unleash a wave of innovation, expanding access to better tools, and in time expanding our collective imagination.  The result is a new political economy.  This is Public AI.},
  file     = {/home/james/snap/zotero-snap/common/Zotero/storage/5N52Z9MK/Public AI - Public AI White Paper.pdf.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/H2CLMI63/edit.html},
}

% Qi et al. (2024), arXiv preprint on shallow safety alignment.
@misc{qiSafetyAlignmentShould2024,
  author        = {Qi, Xiangyu and Panda, Ashwinee and Lyu, Kaifeng and Ma, Xiao and Roy, Subhrajit and Beirami, Ahmad and Mittal, Prateek and Henderson, Peter},
  title         = {Safety {{Alignment Should Be Made More Than Just}} a {{Few Tokens Deep}}},
  year          = {2024},
  month         = jun,
  number        = {arXiv:2406.05946},
  publisher     = {arXiv},
  eprint        = {2406.05946},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  doi           = {10.48550/arXiv.2406.05946},
  urldate       = {2025-12-01},
  keywords      = {Computer Science - Artificial Intelligence,Computer Science - Cryptography and Security},
  abstract      = {The safety alignment of current Large Language Models (LLMs) is vulnerable. Relatively simple attacks, or even benign fine-tuning, can jailbreak aligned models. We argue that many of these vulnerabilities are related to a shared underlying issue: safety alignment can take shortcuts, wherein the alignment adapts a model's generative distribution primarily over only its very first few output tokens. We refer to this issue as shallow safety alignment. In this paper, we present case studies to explain why shallow safety alignment can exist and provide evidence that current aligned LLMs are subject to this issue. We also show how these findings help explain multiple recently discovered vulnerabilities in LLMs, including the susceptibility to adversarial suffix attacks, prefilling attacks, decoding parameter attacks, and fine-tuning attacks. Importantly, we discuss how this consolidated notion of shallow safety alignment sheds light on promising research directions for mitigating these vulnerabilities. For instance, we show that deepening the safety alignment beyond just the first few tokens can often meaningfully improve robustness against some common exploits. Finally, we design a regularized finetuning objective that makes the safety alignment more persistent against fine-tuning attacks by constraining updates on initial tokens. Overall, we advocate that future safety alignment should be made more than just a few tokens deep.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/T4EU8HKR/Qi et al. - 2024 - Safety Alignment Should Be Made More Than Just a Few Tokens Deep.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/IC5XQ9SS/2406.html},
}

% Qwen3.5 announcement; the author is a team name, hence the double braces.
@misc{qwen3.5,
  author = {{Qwen Team}},
  title  = {Qwen3.5: {{Towards}} Native Multimodal Agents},
  year   = {2026},
  month  = feb,
}

% Coase, "The Problem of Social Cost". The citation key keeps its original spelling so existing citations still resolve.
% NOTE(review): volume is taken from the old scraped title ("Vol 3") and the DOI suffix from the attached snapshot's file name; year and pages are from the standard citation -- confirm.
@article{r.h.coarseProblemSocialCost,
  title = {The {{Problem}} of {{Social Cost}}},
  shorttitle = {The {{Problem}} of {{Social Cost}}},
  author = {Coase, R. H.},
  year = 1960,
  journal = {The Journal of Law and Economics},
  volume = {3},
  pages = {1--44},
  doi = {10.1086/466560},
  urldate = {2026-02-02},
  langid = {english},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/CRK3V3ID/466560.html}
}

% Radford et al. (2019), the GPT-2 report.
% NOTE(review): this was typed as a proceedings paper with no booktitle; it is recorded here as an OpenAI technical report -- confirm the institution and the expanded given names.
@techreport{radfordLanguageModelsAre2019,
  title = {Language {{Models}} Are {{Unsupervised Multitask Learners}}},
  author = {Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
  year = 2019,
  institution = {OpenAI},
  urldate = {2026-01-25},
  abstract = {Natural language processing tasks, such as question answering, machine translation, reading comprehension, and summarization, are typically approached with supervised learning on task-specific datasets. We demonstrate that language models begin to learn these tasks without any explicit supervision when trained on a new dataset of millions of webpages called WebText. When conditioned on a document plus questions, the answers generated by the language model reach 55 F1 on the CoQA dataset matching or exceeding the performance of 3 out of 4 baseline systems without using the 127,000+ training examples. The capacity of the language model is essential to the success of zero-shot task transfer and increasing it improves performance in a log-linear fashion across tasks. Our largest model, GPT-2, is a 1.5B parameter Transformer that achieves state of the art results on 7 out of 8 tested language modeling datasets in a zero-shot setting but still underfits WebText. Samples from the model reflect these improvements and contain coherent paragraphs of text. These findings suggest a promising path towards building language processing systems which learn to perform tasks from their naturally occurring demonstrations.}
}

% Rafailov et al., Direct Preference Optimization (arXiv preprint, 2024 revision).
@misc{rafailovDirectPreferenceOptimization2024,
  author        = {Rafailov, Rafael and Sharma, Archit and Mitchell, Eric and Ermon, Stefano and Manning, Christopher D. and Finn, Chelsea},
  title         = {Direct {{Preference Optimization}}: {{Your Language Model}} Is {{Secretly}} a {{Reward Model}}},
  shorttitle    = {Direct {{Preference Optimization}}},
  year          = {2024},
  month         = jul,
  number        = {arXiv:2305.18290},
  publisher     = {arXiv},
  eprint        = {2305.18290},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  doi           = {10.48550/arXiv.2305.18290},
  urldate       = {2025-10-25},
  keywords      = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
  abstract      = {While large-scale unsupervised language models (LMs) learn broad world knowledge and some reasoning skills, achieving precise control of their behavior is difficult due to the completely unsupervised nature of their training. Existing methods for gaining such steerability collect human labels of the relative quality of model generations and fine-tune the unsupervised LM to align with these preferences, often with reinforcement learning from human feedback (RLHF). However, RLHF is a complex and often unstable procedure, first fitting a reward model that reflects the human preferences, and then fine-tuning the large unsupervised LM using reinforcement learning to maximize this estimated reward without drifting too far from the original model. In this paper we introduce a new parameterization of the reward model in RLHF that enables extraction of the corresponding optimal policy in closed form, allowing us to solve the standard RLHF problem with only a simple classification loss. The resulting algorithm, which we call Direct Preference Optimization (DPO), is stable, performant, and computationally lightweight, eliminating the need for sampling from the LM during fine-tuning or performing significant hyperparameter tuning. Our experiments show that DPO can fine-tune LMs to align with human preferences as well as or better than existing methods. Notably, fine-tuning with DPO exceeds PPO-based RLHF in ability to control sentiment of generations, and matches or improves response quality in summarization and single-turn dialogue while being substantially simpler to implement and train.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/IVLQ6B7N/Rafailov et al. - 2024 - Direct Preference Optimization Your Language Model is Secretly a Reward Model.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/NMSA44TU/2305.html},
}

% Rawls, A Theory of Justice, revised edition (1999). The abstract's leftover HTML emphasis tags are replaced with LaTeX markup.
@book{rawlsTheoryJusticeRevised1999,
  title = {A {{Theory}} of {{Justice}}: {{Revised Edition}}},
  shorttitle = {A {{Theory}} of {{Justice}}},
  author = {Rawls, John},
  year = 1999,
  eprint = {j.ctvkjb25m},
  eprinttype = {jstor},
  publisher = {Harvard University Press},
  doi = {10.2307/j.ctvkjb25m},
  urldate = {2025-07-08},
  abstract = {Since it appeared in 1971, John Rawls's \emph{A Theory of Justice} has become a classic. The author has now revised the original edition to clear up a number of difficulties he and others have found in the original book. Rawls aims to express an essential part of the common core of the democratic tradition---justice as fairness---and to provide an alternative to utilitarianism, which had dominated the Anglo-Saxon tradition of political thought since the nineteenth century. Rawls substitutes the ideal of the social contract as a more satisfactory account of the basic rights and liberties of citizens as free and equal persons. "Each person," writes Rawls, "possesses an inviolability founded on justice that even the welfare of society as a whole cannot override." Advancing the ideas of Rousseau, Kant, Emerson, and Lincoln, Rawls's theory is as powerful today as it was when first published.},
  isbn = {978-0-674-00077-3}
}

% Roethlisberger and Dickson, Management and the Worker (1939). The old publisher field mixed city and publisher and misspelled "University".
% NOTE(review): the co-author Dickson is added from the standard citation of this book -- confirm. The abstract field still holds a library-catalog link, not an abstract.
@book{roethlisbergerManagementWorker1939,
  title = {Management and the Worker},
  author = {Roethlisberger, F. J. and Dickson, William J.},
  year = 1939,
  publisher = {Harvard University Press},
  address = {Cambridge, MA},
  urldate = {2026-02-05},
  abstract = {http://uf.catalog.fcla.edu/uf.jsp?st=UF021927079\&ix=pm\&I=0\&V=D\&pm=1},
  collaborator = {{University of Florida}, George A. Smathers Libraries},
  langid = {english},
  keywords = {Research Industrial}
}

% Rokeach, The Nature of Human Values (1973).
@book{rokeachNatureHumanValues1973,
  author      = {Rokeach, Milton},
  title       = {The {{Nature}} of {{Human Values}}},
  year        = {1973},
  publisher   = {Free Press},
  isbn        = {978-0-02-926750-9},
  googlebooks = {fUdqAAAAMAAJ},
  langid      = {english},
  keywords    = {Philosophy / Ethics & Moral Philosophy},
  abstract    = {Milton Rokeach's book The Nature of Human Values (1973), and the Rokeach Value Survey, which the book served as the test manual for, occupied the final years of his career. In it, he posited that a relatively few "terminal human values" are the internal reference points that all people use to formulate attitudes and opinions, and that by measuring the "relative ranking" of these values one could predict a wide variety of behavior, including political affiliation and religious belief. This theory led to a series of experiments in which changes in values led to measurable changes in opinion for an entire small city in the state of Washington.},
}

% Ross, The Right and the Good, 2002 edition edited by Philip Stratton-Lake.
@book{rossRightGood2002,
  author    = {Ross, David},
  editor    = {{Stratton-Lake}, Philip},
  title     = {The {{Right}} and the {{Good}}},
  year      = {2002},
  month     = aug,
  publisher = {Oxford University Press},
  isbn      = {978-0-19-925265-7},
  doi       = {10.1093/0199252653.001.0001},
  urldate   = {2026-01-27},
  abstract  = {The Right and the Good is a classic of 20th-century philosophy by the great scholar Sir David Ross, which is now presented in a new edition with a substantial introduction by Philip Stratton--Lake, a leading expert on Ross. Ross's book was originally published in 1930, and is the pinnacle of ethical intuitionism, which was the dominant moral theory in British philosophy for much of the 19th and early 20th century. The central concern of the book is with rightness and goodness, and their relation to one another. Ross argues against notable rival ethical theories. The right act, he holds, cannot be derived from the moral value of the motive from which it is done; furthermore, rightness is not wholly determined by the value of the consequences of one's action, whether this value is some benefit for the agent, or some agent-neutral good. Rather, the right act is determined by a plurality of self-evident prima facie duties. Ross portrayed rightness and goodness as simple non-natural properties. Philip Stratton provides a substantial introduction to the book, in which he discusses its central themes and clears up some common misunderstandings. A new bibliography and index are also included, along with editorial notes that aim to clarify certain points and indicate where Ross later changed his mind on particular issues. Intuitionism is now enjoying a considerable revival, and this new edition provides the context for a proper modern understanding of Ross's great work.},
}

% Russell and Norvig, Artificial Intelligence: A Modern Approach, 4th edition (Global Edition).
% The full title and edition number are the ones stated in this entry's own abstract.
@book{russellArtificialIntelligenceGlobal2021,
  title = {Artificial {{Intelligence}}: {{A Modern Approach}}, {{Global Edition}}},
  shorttitle = {Artificial {{Intelligence}}},
  author = {Russell, Stuart and Norvig, Peter},
  year = 2021,
  edition = {4},
  publisher = {Pearson Education},
  urldate = {2026-01-20},
  abstract = {The most comprehensive, up-to-date introduction to the theory and practice of artificial intelligence The long-anticipated revision of Artificial Intelligence: A Modern Approach explores the full breadth and depth of the field of artificial intelligence (AI). The 4th Edition brings readers up to date on the latest technologies, present concepts in a more unified manner, and offers new or expanded coverage of machine learning, deep learning, transfer learning, multi agent systems, robotics, natural language processing, causality, probabilistic programming, privacy, fairness, and safe AI.},
  copyright = {Pearson Deutschland},
  isbn = {978-1-292-40117-1},
  langid = {english},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/4BX7SC9B/9781292401171.html}
}

% Fazio and Zanna (1981), chapter in Advances in Experimental Social Psychology, vol. 14.
% Both authors were exported as braced literal names; they are now normal personal names so styles can sort and abbreviate them.
% NOTE(review): the series editor is added from the standard citation for this volume -- confirm.
@incollection{russellh.fazioDirectExperienceAttitudeBehavior1981,
  title = {Direct {{Experience}} and {{Attitude-Behavior Consistency}}},
  booktitle = {Advances in {{Experimental Social Psychology}}},
  author = {Fazio, Russell H. and Zanna, Mark P.},
  editor = {Berkowitz, Leonard},
  year = 1981,
  month = jan,
  volume = {14},
  pages = {161--202},
  publisher = {Academic Press},
  issn = {0065-2601},
  doi = {10.1016/S0065-2601(08)60372-X},
  urldate = {2025-12-07},
  abstract = {The chapter discusses the role of the manner of attitude formation. It focuses on the development of an attitude through direct behavioral experience \dots},
  langid = {american},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/D8HLGPW8/S006526010860372X.html}
}

% Samin (2025), web post saved as an HTML snapshot.
% NOTE(review): this was typed as a journal article with no journal, which triggers a missing-field warning; it is recorded as misc until the hosting site and url are added.
@misc{saminUnlessItsGovernance2025,
  title = {Unless Its Governance Changes, {{Anthropic}} Is Untrustworthy},
  author = {Samin, Mikhail},
  year = 2025,
  month = nov,
  urldate = {2025-12-16},
  abstract = {Anthropic is untrustworthy. {$\bullet$} This post provides arguments, asks questions, and documents some examples of Anthropic's leadership being misleading an\dots},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/SJCNTJFV/unless-its-governance-changes-anthropic-is-untrustworthy.html}
}

% Samuelson (1954), "The Pure Theory of Public Expenditure", with JSTOR identifiers.
@article{samuelsonPureTheoryPublic1954,
  author     = {Samuelson, Paul A.},
  title      = {The {{Pure Theory}} of {{Public Expenditure}}},
  journal    = {The Review of Economics and Statistics},
  year       = {1954},
  volume     = {36},
  number     = {4},
  pages      = {387--389},
  publisher  = {The MIT Press},
  issn       = {0034-6535},
  eprint     = {1925895},
  eprinttype = {jstor},
  doi        = {10.2307/1925895},
  urldate    = {2026-02-02},
}

% Sandberg and Bostrom (2008), Whole Brain Emulation roadmap.
% NOTE(review): this was typed as a journal article with no journal; it is recorded as a technical report of the institute named in the old publisher field. The report number and expanded given names are from the standard citation -- confirm.
@techreport{sandbergWholeBrainEmulation2008,
  title = {Whole Brain Emulation: A Roadmap},
  shorttitle = {Whole Brain Emulation},
  author = {Sandberg, Anders and Bostrom, Nick},
  year = 2008,
  type = {Technical Report},
  number = {2008-3},
  institution = {Future of Humanity Institute},
  urldate = {2025-12-23},
  langid = {english},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/GAYVCWWC/Sandberg and Bostrom - 2008 - Whole brain emulation a roadmap.pdf}
}

% Sandbrink (2023), arXiv preprint on biosecurity risks of language models versus biological design tools.
@misc{sandbrinkArtificialIntelligenceBiological2023,
  author        = {Sandbrink, Jonas B.},
  title         = {Artificial Intelligence and Biological Misuse: {{Differentiating}} Risks of Language Models and Biological Design Tools},
  shorttitle    = {Artificial Intelligence and Biological Misuse},
  year          = {2023},
  month         = dec,
  number        = {arXiv:2306.13952},
  publisher     = {arXiv},
  eprint        = {2306.13952},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  doi           = {10.48550/arXiv.2306.13952},
  urldate       = {2026-01-05},
  keywords      = {Computer Science - Computers and Society},
  abstract      = {As advancements in artificial intelligence (AI) propel progress in the life sciences, they may also enable the weaponisation and misuse of biological agents. This article differentiates two classes of AI tools that could pose such biosecurity risks: large language models (LLMs) and biological design tools (BDTs). LLMs, such as GPT-4 and its successors, might provide dual-use information and thus remove some barriers encountered by historical biological weapons efforts. As LLMs are turned into multi-modal lab assistants and autonomous science tools, this will increase their ability to support non-experts in performing laboratory work. Thus, LLMs may in particular lower barriers to biological misuse. In contrast, BDTs will expand the capabilities of sophisticated actors. Concretely, BDTs may enable the creation of pandemic pathogens substantially worse than anything seen to date and could enable forms of more predictable and targeted biological weapons. In combination, the convergence of LLMs and BDTs could raise the ceiling of harm from biological agents and could make them broadly accessible. A range of interventions would help to manage risks. Independent pre-release evaluations could help understand the capabilities of models and the effectiveness of safeguards. Options for differentiated access to such tools should be carefully weighed with the benefits of openly releasing systems. Lastly, essential for mitigating risks will be universal and enhanced screening of gene synthesis products.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/WDATB6E6/Sandbrink - 2023 - Artificial intelligence and biological misuse Differentiating risks of language models and biologic.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/3Q5WZ4ZY/2306.html},
}

% Sanders and Schneier (2023), Foreign Policy magazine article.
% The two authors were joined by a comma, which BibTeX parses as one person with a "Jr" part; they are now separated by "and".
% The entry is typed as an article so that the journal field is actually printed.
@article{sandersBuildAIPeople2023,
  title = {Build {{AI}} by the {{People}}, for the {{People}}},
  author = {Sanders, Nathan E. and Schneier, Bruce},
  year = 2023,
  month = jun,
  journal = {Foreign Policy},
  urldate = {2025-12-23},
  abstract = {Washington needs to take AI investment out of the hands of private companies.},
  langid = {american}
}

% Santurkar et al. (2023), arXiv preprint introducing the OpinionsQA dataset.
@misc{santurkarWhoseOpinionsLanguage2023,
  author        = {Santurkar, Shibani and Durmus, Esin and Ladhak, Faisal and Lee, Cinoo and Liang, Percy and Hashimoto, Tatsunori},
  title         = {Whose {{Opinions Do Language Models Reflect}}?},
  year          = {2023},
  month         = mar,
  number        = {arXiv:2303.17548},
  publisher     = {arXiv},
  eprint        = {2303.17548},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  doi           = {10.48550/arXiv.2303.17548},
  urldate       = {2025-12-03},
  keywords      = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Computers and Society,Computer Science - Machine Learning},
  abstract      = {Language models (LMs) are increasingly being used in open-ended contexts, where the opinions reflected by LMs in response to subjective queries can have a profound impact, both on user satisfaction, as well as shaping the views of society at large. In this work, we put forth a quantitative framework to investigate the opinions reflected by LMs -- by leveraging high-quality public opinion polls and their associated human responses. Using this framework, we create OpinionsQA, a new dataset for evaluating the alignment of LM opinions with those of 60 US demographic groups over topics ranging from abortion to automation. Across topics, we find substantial misalignment between the views reflected by current LMs and those of US demographic groups: on par with the Democrat-Republican divide on climate change. Notably, this misalignment persists even after explicitly steering the LMs towards particular demographic groups. Our analysis not only confirms prior observations about the left-leaning tendencies of some human feedback-tuned LMs, but also surfaces groups whose opinions are poorly reflected by current LMs (e.g., 65+ and widowed individuals). Our code and data are available at https://github.com/tatsu-lab/opinions\_qa.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/CIVS4P7W/Santurkar et al. - 2023 - Whose Opinions Do Language Models Reflect.pdf},
}

% Savage, The Foundations of Statistics (1954). City and publisher are now in separate fields.
% NOTE(review): the abstract field holds the catalog's physical description, not an abstract.
@book{savageFoundationsStatistics1954,
  title = {The Foundations of Statistics},
  author = {Savage, Leonard J.},
  year = 1954,
  publisher = {Wiley},
  address = {New York},
  urldate = {2026-01-26},
  abstract = {294 p. 24 cm},
  collaborator = {{Internet Archive}},
  langid = {english},
  keywords = {Statistics}
}

% Schick et al. (2023), Toolformer arXiv preprint. The abstract's over-escaped ampersand is corrected so it no longer prints a stray backslash.
@misc{schickToolformerLanguageModels2023,
  title = {Toolformer: {{Language Models Can Teach Themselves}} to {{Use Tools}}},
  shorttitle = {Toolformer},
  author = {Schick, Timo and {Dwivedi-Yu}, Jane and Dess{\`i}, Roberto and Raileanu, Roberta and Lomeli, Maria and Zettlemoyer, Luke and Cancedda, Nicola and Scialom, Thomas},
  year = 2023,
  month = feb,
  number = {arXiv:2302.04761},
  eprint = {2302.04761},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2302.04761},
  urldate = {2026-01-20},
  abstract = {Language models (LMs) exhibit remarkable abilities to solve new tasks from just a few examples or textual instructions, especially at scale. They also, paradoxically, struggle with basic functionality, such as arithmetic or factual lookup, where much simpler and smaller models excel. In this paper, we show that LMs can teach themselves to use external tools via simple APIs and achieve the best of both worlds. We introduce Toolformer, a model trained to decide which APIs to call, when to call them, what arguments to pass, and how to best incorporate the results into future token prediction. This is done in a self-supervised way, requiring nothing more than a handful of demonstrations for each API. We incorporate a range of tools, including a calculator, a Q\&A system, two different search engines, a translation system, and a calendar. Toolformer achieves substantially improved zero-shot performance across a variety of downstream tasks, often competitive with much larger models, without sacrificing its core language modeling abilities.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Computation and Language},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/H8UVSBUR/Schick et al. - 2023 - Toolformer Language Models Can Teach Themselves to Use Tools.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/QXX9WA7Z/2302.html}
}

% Schoen et al. (2025), arXiv preprint on stress-testing deliberative alignment against scheming.
@misc{schoenStressTestingDeliberative2025,
  author        = {Schoen, Bronson and Nitishinskaya, Evgenia and Balesni, Mikita and H{\o}jmark, Axel and Hofst{\"a}tter, Felix and Scheurer, J{\'e}r{\'e}my and Meinke, Alexander and Wolfe, Jason and van der Weij, Teun and Lloyd, Alex and {Goldowsky-Dill}, Nicholas and Fan, Angela and Matveiakin, Andrei and Shah, Rusheb and Williams, Marcus and Glaese, Amelia and Barak, Boaz and Zaremba, Wojciech and Hobbhahn, Marius},
  title         = {Stress {{Testing Deliberative Alignment}} for {{Anti-Scheming Training}}},
  year          = {2025},
  month         = sep,
  number        = {arXiv:2509.15541},
  publisher     = {arXiv},
  eprint        = {2509.15541},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  doi           = {10.48550/arXiv.2509.15541},
  urldate       = {2025-10-28},
  keywords      = {Computer Science - Artificial Intelligence},
  abstract      = {Highly capable AI systems could secretly pursue misaligned goals -- what we call "scheming". Because a scheming AI would deliberately try to hide its misaligned goals and actions, measuring and mitigating scheming requires different strategies than are typically used in ML. We propose that assessing anti-scheming interventions requires at least (1) testing propensity to scheme on far out-of-distribution (OOD) tasks, (2) evaluating whether lack of scheming is driven by situational awareness, and (3) checking for robustness to pre-existing misaligned goals. We use a broad category of "covert actions" -- such as secretly breaking rules or intentionally underperforming in tests -- as a proxy for scheming, and design evaluations for covert actions. We then stress-test deliberative alignment as a case study for anti-scheming. Across 26 OOD evaluations (180+ environments), deliberative alignment reduces covert action rates (OpenAI o3: 13\%-{$>$}0.4\%) but does not fully eliminate them. Our mitigation is also able to largely stop agents from pursuing a hidden goal previously trained into the model, but we still find misbehavior after additional red-teaming. We find that models' chain-of-thought (CoT) often demonstrates awareness of being evaluated for alignment, and show causal evidence that this awareness decreases covert behavior, while unawareness increases it. Therefore, we cannot exclude that the observed reductions in covert action rates are at least partially driven by situational awareness. While we rely on human-legible CoT for training, studying situational awareness, and demonstrating clear evidence of misalignment, our ability to rely on this degrades as models continue to depart from reasoning in standard English. We encourage research into alignment mitigations for scheming and their assessment, especially for the adversarial case of deceptive alignment, which this paper does not address.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/P9Q5Q9YV/Schoen et al. - 2025 - Stress Testing Deliberative Alignment for Anti-Scheming Training.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/XBVD76B2/2509.html},
}

% Schulman et al. (2017), the PPO arXiv preprint.
@misc{schulmanProximalPolicyOptimization2017,
  author        = {Schulman, John and Wolski, Filip and Dhariwal, Prafulla and Radford, Alec and Klimov, Oleg},
  title         = {Proximal {{Policy Optimization Algorithms}}},
  year          = {2017},
  month         = aug,
  number        = {arXiv:1707.06347},
  publisher     = {arXiv},
  eprint        = {1707.06347},
  archiveprefix = {arXiv},
  primaryclass  = {cs},
  doi           = {10.48550/arXiv.1707.06347},
  urldate       = {2025-03-11},
  keywords      = {Computer Science - Machine Learning},
  abstract      = {We propose a new family of policy gradient methods for reinforcement learning, which alternate between sampling data through interaction with the environment, and optimizing a "surrogate" objective function using stochastic gradient ascent. Whereas standard policy gradient methods perform one gradient update per data sample, we propose a novel objective function that enables multiple epochs of minibatch updates. The new methods, which we call proximal policy optimization (PPO), have some of the benefits of trust region policy optimization (TRPO), but they are much simpler to implement, more general, and have better sample complexity (empirically). Our experiments test PPO on a collection of benchmark tasks, including simulated robotic locomotion and Atari game playing, and we show that PPO outperforms other online policy gradient methods, and overall strikes a favorable balance between sample complexity, simplicity, and wall-time.},
  file          = {/home/james/snap/zotero-snap/common/Zotero/storage/SKFAPYF3/Schulman et al. - 2017 - Proximal Policy Optimization Algorithms.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/X5VYUDJG/1707.html},
}

% Schwartz (1994), Journal of Social Issues article on universal aspects of human values.
@article{schwartzAreThereUniversal1994,
  author   = {Schwartz, Shalom H.},
  title    = {Are {{There Universal Aspects}} in the {{Structure}} and {{Contents}} of {{Human Values}}?},
  journal  = {Journal of Social Issues},
  year     = {1994},
  volume   = {50},
  number   = {4},
  pages    = {19--45},
  issn     = {1540-4560},
  doi      = {10.1111/j.1540-4560.1994.tb01196.x},
  urldate  = {2025-12-05},
  langid   = {english},
  abstract = {This article presents a theory of potentially universal aspects in the content of human values. Ten types of values are distinguished by their motivational goals. The theory also postulates a structure of relations among the value types, based on the conflicts and compatibilities experienced when pursuing them. This structure permits one to relate systems of value priorities, as an integrated whole, to other variables. A new values instrument, based on the theory and suitable for cross-cultural research, is described. Evidence relevant for assessing the theory, from 97 samples in 44 countries, is summarized. Relations of this approach to Rokeach's work on values and to other theories and research on value dimensions are discussed. Application of the approach to social issues is exemplified in the domains of politics and intergroup relations.},
  file     = {/home/james/snap/zotero-snap/common/Zotero/storage/B5TRGH55/Schwartz - 1994 - Are There Universal Aspects in the Structure and Contents of Human Values.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/YUNEHMAE/j.1540-4560.1994.tb01196.html},
}

@article{schwartzOverviewSchwartzTheory2012,
  title = {An {{Overview}} of the {{Schwartz Theory}} of {{Basic Values}}},
  author = {Schwartz, Shalom H.},
  year = 2012,
  month = dec,
  journal = {Online Readings in Psychology and Culture},
  volume = {2},
  number = {1},
  issn = {2307-0919},
  doi = {10.9707/2307-0919.1116},
  abstract = {This article presents an overview of the Schwartz theory of basic human values. It discusses the nature of values and spells out the features that are common to all values and what distinguishes one value from another. The theory identifies ten basic personal values that are recognized across cultures and explains where they come from. At the heart of the theory is the idea that values form a circular structure that reflects the motivations each value expresses. This circular structure, that captures the conflicts and compatibility among the ten values is apparently culturally universal. The article elucidates the psychological principles that give rise to it. Next, it presents the two major methods developed to measure the basic values, the Schwartz Value Survey and the Portrait Values Questionnaire. Findings from 82 countries, based on these and other methods, provide evidence for the validity of the theory across cultures. The findings reveal substantial differences in the value priorities of individuals. Surprisingly, however, the average value priorities of most societal groups exhibit a similar hierarchical order whose existence the article explains. The last section of the article clarifies how values differ from other concepts used to explain behavior---attitudes, beliefs, norms, and traits.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/SQJ89VZZ/Schwartz - 2012 - An Overview of the Schwartz Theory of Basic Values.pdf}
}

@article{schwartzRefiningTheoryBasic2012,
  author = {Schwartz, Shalom H. and Cieciuch, Jan and Vecchione, Michele and Davidov, Eldad and Fischer, Ronald and Beierlein, Constanze and Ramos, Alice and Verkasalo, Markku and L{\"o}nnqvist, Jan-Erik and Demirutku, Kursad and {Dirilen-Gumus}, Ozlem and Konty, Mark},
  title = {Refining the Theory of Basic Individual Values},
  journal = {Journal of Personality and Social Psychology},
  year = 2012,
  month = oct,
  volume = {103},
  number = {4},
  pages = {663--688},
  issn = {1939-1315},
  doi = {10.1037/a0029393},
  pmid = {22823292},
  langid = {english},
  keywords = {Adult,Cross-Cultural Comparison,Female,Humans,Male,Motivation,Personality,Psychological Theory,Reproducibility of Results,Social Values,Young Adult},
  abstract = {We propose a refined theory of basic individual values intended to provide greater heuristic and explanatory power than the original theory of 10 values (Schwartz, 1992). The refined theory more accurately expresses the central assumption of the original theory that research has largely ignored: Values form a circular motivational continuum. The theory defines and orders 19 values on the continuum based on their compatible and conflicting motivations, expression of self-protection versus growth, and personal versus social focus. We assess the theory with a new instrument in 15 samples from 10 countries (N = 6,059). Confirmatory factor and multidimensional scaling analyses support discrimination of the 19 values, confirming the refined theory. Multidimensional scaling analyses largely support the predicted motivational order of the values. Analyses of predictive validity demonstrate that the refined values theory provides greater and more precise insight into the value underpinnings of beliefs. Each value correlates uniquely with external variables.}
}

@incollection{schwartzUniversalsContentStructure1992,
  title = {Universals in the {{Content}} and {{Structure}} of {{Values}}: {{Theoretical Advances}} and {{Empirical Tests}} in 20 {{Countries}}},
  shorttitle = {Universals in the {{Content}} and {{Structure}} of {{Values}}},
  booktitle = {Advances in {{Experimental Social Psychology}}},
  author = {Schwartz, Shalom H.},
  editor = {Zanna, Mark P.},
  year = 1992,
  month = jan,
  volume = {25},
  pages = {1--65},
  publisher = {Academic Press},
  issn = {0065-2601},
  doi = {10.1016/S0065-2601(08)60281-6},
  urldate = {2025-12-12},
  abstract = {This chapter addresses the universals in the content and structure of values, concentrating on the theoretical advances and empirical tests in 20 coun\dots},
  langid = {american},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/TC6P2XVK/S0065260108602816.html}
}

@misc{shaoDeepSeekMathPushingLimits2024,
  author = {Shao, Zhihong and Wang, Peiyi and Zhu, Qihao and Xu, Runxin and Song, Junxiao and Bi, Xiao and Zhang, Haowei and Zhang, Mingchuan and Li, Y. K. and Wu, Y. and Guo, Daya},
  title = {{{DeepSeekMath}}: {{Pushing}} the {{Limits}} of {{Mathematical Reasoning}} in {{Open Language Models}}},
  shorttitle = {{{DeepSeekMath}}},
  year = 2024,
  month = apr,
  eprint = {2402.03300},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  number = {arXiv:2402.03300},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2402.03300},
  urldate = {2025-11-26},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
  abstract = {Mathematical reasoning poses a significant challenge for language models due to its complex and structured nature. In this paper, we introduce DeepSeekMath 7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B math-related tokens sourced from Common Crawl, together with natural language and code data. DeepSeekMath 7B has achieved an impressive score of 51.7\% on the competition-level MATH benchmark without relying on external toolkits and voting techniques, approaching the performance level of Gemini-Ultra and GPT-4. Self-consistency over 64 samples from DeepSeekMath 7B achieves 60.9\% on MATH. The mathematical reasoning capability of DeepSeekMath is attributed to two key factors: First, we harness the significant potential of publicly available web data through a meticulously engineered data selection pipeline. Second, we introduce Group Relative Policy Optimization (GRPO), a variant of Proximal Policy Optimization (PPO), that enhances mathematical reasoning abilities while concurrently optimizing the memory usage of PPO.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/R77PDRQV/Shao et al. - 2024 - DeepSeekMath Pushing the Limits of Mathematical Reasoning in Open Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/PWMTIC9Q/2402.html}
}

@book{shoshanazuboffAgeSurveillanceCapitalism2017,
  title = {The {{Age}} of {{Surveillance Capitalism}}: {{The Fight}} for a {{Human Future}} at the {{New Frontier}} of {{Power}}},
  shorttitle = {The {{Age}} of {{Surveillance Capitalism}}},
  author = {Zuboff, Shoshana},
  year = 2019,
  month = jan,
  publisher = {PublicAffairs},
  address = {New York},
  urldate = {2026-01-05},
  abstract = {An~expos\'e of the unprecedented form of power called ``surveillance capitalism,'' and the quest by powerful corporations to predict and control our behavi...},
  isbn = {978-1-61039-569-4},
  langid = {american},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/CJLQWC2C/9781610395694.html}
}

@article{simonBehavioralModelRational1955,
  author = {Simon, Herbert A.},
  title = {A {{Behavioral Model}} of {{Rational Choice}}},
  journal = {The Quarterly Journal of Economics},
  year = 1955,
  volume = {69},
  number = {1},
  pages = {99--118},
  publisher = {Oxford University Press},
  issn = {0033-5533},
  doi = {10.2307/1884852},
  eprint = {1884852},
  eprinttype = {jstor},
  urldate = {2026-01-26},
  abstract = {Introduction, 99.--I. Some general features of rational choice, 100.--II. The essential simplifications, 103.--III. Existence and uniqueness of solutions, 111.--IV. Further comments on dynamics, 113.--V. Conclusion, 114.--Appendix, 115.}
}

@book{singerExpandingCircleEthics1981,
  title = {The Expanding Circle: Ethics and Sociobiology},
  shorttitle = {The Expanding Circle},
  author = {Singer, Peter},
  year = 1981,
  publisher = {Clarendon Press},
  address = {Oxford},
  urldate = {2026-01-27},
  abstract = {xii, 190 p. ; 22 cm; Includes bibliographical references and index},
  collaborator = {{Internet Archive}},
  isbn = {978-0-19-824646-6},
  langid = {english},
  keywords = {Ethics}
}

@misc{smallOpportunitiesRisksLLMs2023a,
  author = {Small, Christopher T. and Vendrov, Ivan and Durmus, Esin and Homaei, Hadjar and Barry, Elizabeth and Cornebise, Julien and Suzman, Ted and Ganguli, Deep and Megill, Colin},
  title = {Opportunities and {{Risks}} of {{LLMs}} for {{Scalable Deliberation}} with {{Polis}}},
  year = 2023,
  month = jun,
  eprint = {2306.11932},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  number = {arXiv:2306.11932},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2306.11932},
  urldate = {2025-11-30},
  keywords = {Computer Science - Computation and Language,Computer Science - Computers and Society,Computer Science - Human-Computer Interaction,Computer Science - Social and Information Networks},
  abstract = {Polis is a platform that leverages machine intelligence to scale up deliberative processes. In this paper, we explore the opportunities and risks associated with applying Large Language Models (LLMs) towards challenges with facilitating, moderating and summarizing the results of Polis engagements. In particular, we demonstrate with pilot experiments using Anthropic's Claude that LLMs can indeed augment human intelligence to help more efficiently run Polis conversations. In particular, we find that summarization capabilities enable categorically new methods with immense promise to empower the public in collective meaning-making exercises. And notably, LLM context limitations have a significant impact on insight and quality of these results. However, these opportunities come with risks. We discuss some of these risks, as well as principles and techniques for characterizing and mitigating them, and the implications for other deliberative or political systems that may employ LLMs. Finally, we conclude with several open future research directions for augmenting tools like Polis with LLMs.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/IJ7LW59E/Small et al. - 2023 - Opportunities and Risks of LLMs for Scalable Deliberation with Polis.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/UU32Z5H6/2306.html}
}

@inproceedings{soaresCorrigibility2015,
  title = {Corrigibility},
  booktitle = {{{AI}} and {{Ethics}}},
  author = {Soares, Nate and Fallenstein, Benja and Armstrong, Stuart and Yudkowsky, Eliezer},
  year = 2015,
  urldate = {2026-01-19},
  abstract = {As artificially intelligent systems grow in intelligence and capability, some of their available options may allow them to resist intervention by their programmers. We call an AI system ``corrigible'' if it cooperates with what its creators regard as a corrective intervention, despite default incentives for rational agents to resist attempts to shut them down or modify their preferences. We introduce the notion of corrigibility and analyze utility functions that attempt to make an agent shut down safely if a shutdown button is pressed, while avoiding incentives to prevent the button from being pressed or cause the button to be pressed, and while ensuring propagation of the shutdown behavior as it creates new subsystems or self-modifies. While some proposals are interesting, none have yet been demonstrated to satisfy all of our intuitive desiderata, leaving this simple problem in corrigibility wide-open.}
}

@incollection{SpeculationsConcerningFirst1966,
  title = {Speculations {{Concerning}} the {{First Ultraintelligent Machine}}},
  booktitle = {Advances in {{Computers}}},
  author = {Good, Irving John},
  editor = {Alt, Franz L. and Rubinoff, Morris},
  year = 1966,
  month = jan,
  volume = {6},
  pages = {31--88},
  publisher = {Elsevier},
  issn = {0065-2458},
  doi = {10.1016/S0065-2458(08)60418-0},
  urldate = {2025-12-24},
  abstract = {An ultra-intelligent machine is a machine that can far surpass all the intellectual activities of any man however clever. The design of machines is on\dots},
  langid = {american},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/5FKU4BUH/1966 - Speculations Concerning the First Ultraintelligent Machine.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/ARZSPD4S/S0065245808604180.html}
}

@article{stiglitzMarketsMarketFailures1989,
  title = {Markets, {{Market Failures}}, and {{Development}}},
  author = {Stiglitz, Joseph E.},
  year = 1989,
  journal = {The American Economic Review},
  volume = {79},
  number = {2},
  pages = {197--203},
  issn = {0002-8282},
  doi = {10.7916/D8BK1PD1},
  urldate = {2025-12-22},
  abstract = {A central question in development economics is, how can we account for differences in the levels of income and the rates of growth between the developed and less developed economies?},
  langid = {english},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/95WU67ND/Stiglitz - 1989 - Markets, Market Failures, and Development.pdf}
}

@misc{stroudAutomatedPolicingProgram2021,
  title = {An Automated Policing Program Got This Man Shot Twice},
  author = {Stroud, Matt},
  year = 2021,
  month = may,
  journal = {The Verge},
  howpublished = {The Verge},
  url = {https://www.theverge.com/c/22444020/heat-listed-csk-entry},
  urldate = {2026-01-05},
  abstract = {The Chicago PD made a ``heat list'' to predict people involved with violent crimes --- and instead, it caused them.}
}

@inproceedings{suhLanguageModelFineTuning2025,
  author = {Suh, Joseph and Jahanparast, Erfan and Moon, Suhong and Kang, Minwoo and Chang, Serina},
  title = {Language {{Model Fine-Tuning}} on {{Scaled Survey Data}} for {{Predicting Distributions}} of {{Public Opinions}}},
  booktitle = {Proceedings of the 63rd {{Annual Meeting}} of the {{Association}} for {{Computational Linguistics}} ({{Volume}} 1: {{Long Papers}})},
  editor = {Che, Wanxiang and Nabende, Joyce and Shutova, Ekaterina and Pilehvar, Mohammad Taher},
  year = 2025,
  month = jul,
  pages = {21147--21170},
  publisher = {Association for Computational Linguistics},
  address = {Vienna, Austria},
  isbn = {979-8-89176-251-0},
  doi = {10.18653/v1/2025.acl-long.1028},
  urldate = {2025-12-03},
  abstract = {Large language models (LLMs) present novel opportunities in public opinion research by predicting survey responses in advance during the early stages of survey design. Prior methods steer LLMs via descriptions of subpopulations as LLMs' input prompt, yet such prompt engineering approaches have struggled to faithfully predict the distribution of survey responses from human subjects. In this work, we propose directly fine-tuning LLMs to predict response distributions by leveraging unique structural characteristics of survey data. To enable fine-tuning, we curate SubPOP, a significantly scaled dataset of 3,362 questions and 70K subpopulation-response pairs from well-established public opinion surveys. We show that fine-tuning on SubPOP greatly improves the match between LLM predictions and human responses across various subpopulations, reducing the LLM-human gap by up to 46\% compared to baselines, and achieves strong generalization to unseen surveys and subpopulations. Our findings highlight the potential of survey-based fine-tuning to improve opinion prediction for diverse, real-world subpopulations and therefore enable more efficient survey designs.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/J3PJRYHK/Suh et al. - 2025 - Language Model Fine-Tuning on Scaled Survey Data for Predicting Distributions of Public Opinions.pdf}
}

@book{suttonReinforcementLearningSecond2018a,
  title = {Reinforcement {{Learning}}: {{An Introduction}}},
  shorttitle = {Reinforcement {{Learning}}},
  author = {Sutton, Richard S. and Barto, Andrew G.},
  year = 2018,
  month = nov,
  edition = {Second},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  abstract = {The significantly expanded and updated new edition of a widely used text on reinforcement learning, one of the most active research areas in artificial intelligence. Reinforcement learning, one of the most active research areas in artificial intelligence, is a computational approach to learning whereby an agent tries to maximize the total amount of reward it receives while interacting with a complex, uncertain environment. In Reinforcement Learning, Richard Sutton and Andrew Barto provide a clear and simple account of the field's key ideas and algorithms. This second edition has been significantly expanded and updated, presenting new topics and updating coverage of other topics. Like the first edition, this second edition focuses on core online learning algorithms, with the more mathematical material set off in shaded boxes. Part I covers as much of reinforcement learning as possible without going beyond the tabular case for which exact solutions can be found. Many algorithms presented in this part are new to the second edition, including UCB, Expected Sarsa, and Double Learning. Part II extends these ideas to function approximation, with new sections on such topics as artificial neural networks and the Fourier basis, and offers expanded treatment of off-policy learning and policy-gradient methods. Part III has new chapters on reinforcement learning's relationships to psychology and neuroscience, as well as an updated case-studies chapter including AlphaGo and AlphaGo Zero, Atari game playing, and IBM Watson's wagering strategy. The final chapter discusses the future societal impacts of reinforcement learning.},
  googlebooks = {uWV0DwAAQBAJ},
  isbn = {978-0-262-35270-3},
  langid = {english},
  keywords = {Computers / Artificial Intelligence / General,Computers / Data Science / Neural Networks,Computers / Programming / Algorithms}
}

@article{TacklingPerilsDual2022,
  title = {Tackling the Perils of Dual Use in {{AI}}},
  year = 2022,
  month = apr,
  journal = {Nature Machine Intelligence},
  volume = {4},
  number = {4},
  pages = {313},
  publisher = {Nature Publishing Group},
  issn = {2522-5839},
  doi = {10.1038/s42256-022-00484-6},
  urldate = {2026-01-05},
  abstract = {Considering the potential for unintended harmful applications of AI tools can lead to deeply concerning findings. An urgent question is how to achieve the right balance between keeping science open and preventing misuse or malicious repurposing.},
  copyright = {2022 Springer Nature Limited},
  langid = {english},
  keywords = {Computer science,Scientific community},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/9ZBA8RZC/2022 - Tackling the perils of dual use in AI.pdf}
}

@misc{tanIfOpenSource2025,
  author = {Tan, Joshua and Vincent, Nicholas and Elkins, Katherine and Sahlgren, Magnus},
  title = {If Open Source Is to Win, It Must Go Public},
  year = 2025,
  month = jul,
  eprint = {2507.09296},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  number = {arXiv:2507.09296},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2507.09296},
  urldate = {2025-12-22},
  keywords = {Computer Science - Computers and Society},
  abstract = {Open source projects have made incredible progress in producing transparent and widely usable machine learning models and systems, but open source alone will face challenges in fully democratizing access to AI. Unlike software, AI models require substantial resources for activation -- compute, post-training, deployment, and oversight -- which only a few actors can currently provide. This paper argues that open source AI must be complemented by public AI: infrastructure and institutions that ensure models are accessible, sustainable, and governed in the public interest. To achieve the full promise of AI models as prosocial public goods, we need to build public infrastructure to power and deliver open source software and models.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/72DH79LD/Tan et al. - 2025 - If open source is to win, it must go public.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/5V6NIBG9/2507.html}
}

@incollection{taylorAlignmentAdvancedMachine2020,
  title = {Alignment for {{Advanced Machine Learning Systems}}},
  booktitle = {Ethics of {{Artificial Intelligence}}},
  author = {Taylor, Jessica and Yudkowsky, Eliezer and LaVictoire, Patrick and Critch, Andrew},
  editor = {Liao, S. Matthew},
  year = 2020,
  month = sep,
  pages = {342--382},
  publisher = {Oxford University Press},
  doi = {10.1093/oso/9780190905033.003.0013},
  urldate = {2025-12-23},
  abstract = {This chapter surveys eight research areas organized around one question: As learning systems become increasingly intelligent and autonomous, what design principles can best ensure that their behavior is aligned with the interests of the operators? The chapter focuses on two major technical obstacles to AI alignment: the challenge of specifying the right kind of objective functions and the challenge of designing AI systems that avoid unintended consequences and undesirable behavior even in cases where the objective function does not line up perfectly with the intentions of the designers. The questions surveyed include the following: How can we train reinforcement learners to take actions that are more amenable to meaningful assessment by intelligent overseers? What kinds of objective functions incentivize a system to ``not have an overly large impact'' or ``not have many side effects''? The chapter discusses these questions, related work, and potential directions for future research, with the goal of highlighting relevant research topics in machine learning that appear tractable today.},
  isbn = {978-0-19-090503-3},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/UK5VHTFG/9780190905033.003.html}
}

@misc{team2023gemini,
  title = {{{Gemini}}: {{A Family}} of {{Highly Capable Multimodal Models}}},
  shorttitle = {{{Gemini}}},
  author = {{Gemini Team} and Anil, Rohan and Borgeaud, Sebastian and Alayrac, Jean-Baptiste and Yu, Jiahui and Soricut, Radu and Schalkwyk, Johan and Dai, Andrew M. and Hauth, Anja and Millican, Katie and others},
  year = 2023,
  month = dec,
  number = {arXiv:2312.11805},
  eprint = {2312.11805},
  primaryclass = {cs.CL},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2312.11805},
  archiveprefix = {arXiv}
}

@misc{teamGLM45AgenticReasoning2025a,
  title = {{{GLM-4}}.5: {{Agentic}}, {{Reasoning}}, and {{Coding}} ({{ARC}}) {{Foundation Models}}},
  shorttitle = {{{GLM-4}}.5},
  author = {{GLM-4.5 Team} and Zeng, Aohan and Lv, Xin and Zheng, Qinkai and Hou, Zhenyu and Chen, Bin and Xie, Chengxing and Wang, Cunxiang and Yin, Da and Zeng, Hao and Zhang, Jiajie and Wang, Kedong and Zhong, Lucen and Liu, Mingdao and Lu, Rui and Cao, Shulin and Zhang, Xiaohan and Huang, Xuancheng and Wei, Yao and Cheng, Yean and An, Yifan and Niu, Yilin and Wen, Yuanhao and Bai, Yushi and Du, Zhengxiao and Wang, Zihan and Zhu, Zilin and Zhang, Bohan and Wen, Bosi and Wu, Bowen and Xu, Bowen and Huang, Can and Zhao, Casey and Cai, Changpeng and Yu, Chao and Li, Chen and Ge, Chendi and Huang, Chenghua and Zhang, Chenhui and Xu, Chenxi and Zhu, Chenzheng and Li, Chuang and Yin, Congfeng and Lin, Daoyan and Yang, Dayong and Jiang, Dazhi and Ai, Ding and Zhu, Erle and Wang, Fei and Pan, Gengzheng and Wang, Guo and Sun, Hailong and Li, Haitao and Li, Haiyang and Hu, Haiyi and Zhang, Hanyu and Peng, Hao and Tai, Hao and Zhang, Haoke and Wang, Haoran and Yang, Haoyu and Liu, He and Zhao, He and Liu, Hongwei and Yan, Hongxi and Liu, Huan and Chen, Huilong and Li, Ji and Zhao, Jiajing and Ren, Jiamin and Jiao, Jian and Zhao, Jiani and Yan, Jianyang and Wang, Jiaqi and Gui, Jiayi and Zhao, Jiayue and Liu, Jie and Li, Jijie and Li, Jing and Lu, Jing and Wang, Jingsen and Yuan, Jingwei and Li, Jingxuan and Du, Jingzhao and Du, Jinhua and Liu, Jinxin and Zhi, Junkai and Gao, Junli and Wang, Ke and Yang, Lekang and Xu, Liang and Fan, Lin and Wu, Lindong and Ding, Lintao and Wang, Lu and Zhang, Man and Li, Minghao and Xu, Minghuan and Zhao, Mingming and Zhai, Mingshu and Du, Pengfan and Dong, Qian and Lei, Shangde and Tu, Shangqing and Yang, Shangtong and Lu, Shaoyou and Li, Shijie and Li, Shuang and {Shuang-Li} and Yang, Shuxun and Yi, Sibo and Yu, Tianshu and Tian, Wei and Wang, Weihan and Yu, Wenbo and Tam, Weng Lam and Liang, Wenjie and Liu, Wentao and Wang, Xiao and Jia, Xiaohan and Gu, Xiaotao and Ling, Xiaoying and Wang, Xin and Fan, Xing and Pan, Xingru and Zhang, Xinyuan and Zhang, Xinze and Fu, Xiuqing and Zhang, Xunkai and Xu, Yabo and Wu, Yandong and Lu, Yida and Wang, Yidong and Zhou, Yilin and Pan, Yiming and Zhang, Ying and Wang, Yingli and Li, Yingru and Su, Yinpei and Geng, Yipeng and Zhu, Yitong and Yang, Yongkun and Li, Yuhang and Wu, Yuhao and Li, Yujiang and Liu, Yunan and Wang, Yunqing and Li, Yuntao and Zhang, Yuxuan and Liu, Zezhen and Yang, Zhen and Zhou, Zhengda and Qiao, Zhongpei and Feng, Zhuoer and Liu, Zhuorui and Zhang, Zichen and Wang, Zihan and Yao, Zijun and Wang, Zikang and Liu, Ziqiang and Chai, Ziwei and Li, Zixuan and Zhao, Zuodong and Chen, Wenguang and Zhai, Jidong and Xu, Bin and Huang, Minlie and Wang, Hongning and Li, Juanzi and Dong, Yuxiao and Tang, Jie},
  year = 2025,
  month = aug,
  number = {arXiv:2508.06471},
  eprint = {2508.06471},
  primaryclass = {cs},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2508.06471},
  urldate = {2026-01-25},
  abstract = {We present GLM-4.5, an open-source Mixture-of-Experts (MoE) large language model with 355B total parameters and 32B activated parameters, featuring a hybrid reasoning method that supports both thinking and direct response modes. Through multi-stage training on 23T tokens and comprehensive post-training with expert model iteration and reinforcement learning, GLM-4.5 achieves strong performance across agentic, reasoning, and coding (ARC) tasks, scoring 70.1\% on TAU-Bench, 91.0\% on AIME 24, and 64.2\% on SWE-bench Verified. With much fewer parameters than several competitors, GLM-4.5 ranks 3rd overall among all evaluated models and 2nd on agentic benchmarks. We release both GLM-4.5 (355B parameters) and a compact version, GLM-4.5-Air (106B parameters), to advance research in reasoning and agentic AI systems. Code, models, and more information are available at https://github.com/zai-org/GLM-4.5.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Computation and Language},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/CLDNNHVU/Team et al. - 2025 - GLM-4.5 Agentic, Reasoning, and Coding (ARC) Foundation Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/4JYVUGZ8/2508.html}
}

@misc{ticeAlignmentPretrainingAI2026,
  author = {Tice, Cameron and Radmard, Puria and Ratnam, Samuel and Kim, Andy and Africa, David and O'Brien, Kyle},
  title = {Alignment {{Pretraining}}: {{AI Discourse Causes Self-Fulfilling}} ({{Mis}})Alignment},
  shorttitle = {Alignment {{Pretraining}}},
  year = 2026,
  month = jan,
  eprint = {2601.10160},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  number = {arXiv:2601.10160},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2601.10160},
  urldate = {2026-01-21},
  langid = {english},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
  abstract = {Pretraining corpora contain extensive discourse about AI systems, yet the causal influence of this discourse on downstream alignment remains poorly understood. If prevailing descriptions of AI behaviour are predominantly negative, LLMs may internalise corresponding behavioural priors, giving rise to self-fulfilling misalignment. This paper provides the first controlled study of this hypothesis by pretraining 6.9B-parameter LLMs with varying amounts of (mis)alignment discourse. We find that discussion of AI contributes to misalignment. Upsampling synthetic training documents about AI misalignment leads to a notable increase in misaligned behaviour. Conversely, upsampling documents about aligned behaviour reduces misalignment scores from 45\% to 9\%. We consider this evidence of self-fulfilling alignment. These effects are dampened, but persist through post-training. Our findings establish the study of how pretraining data shapes alignment priors, or alignment pretraining, as a complement to post-training. We recommend practitioners pretrain for alignment as well as capabilities. Our models and datasets are available at alignmentpretraining.ai.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/LWL5EJMN/Tice et al. - 2026 - Alignment Pretraining AI Discourse Causes Self-Fulfilling (Mis)alignment.pdf}
}

@book{tocquevilleDemocracyAmerica1835,
  title = {Democracy in {{America}}},
  author = {de Tocqueville, Alexis},
  translator = {Reeve, Henry},
  year = 1835,
  publisher = {{Saunders and Otley}},
  address = {London},
  urldate = {2025-12-22},
  copyright = {The books in this collection are in the public domain and are free to use and reuse. Credit Line: Library of Congress More about Copyright and other Restrictions. For guidance about compiling full citations consult Citing Primary Sources.},
  langid = {english},
  lccn = {09021576},
  keywords = {Democracy,Politics and government,Social conditions,To 1865,United States},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/L8HTA4DQ/Tocqueville and Reeve - 1835 - Democracy in America.pdf}
}

@phdthesis{tomasmikolavSTATISTICALLANGUAGEMODELS,
  title = {Statistical {{Language Models Based}} on {{Neural Networks}}},
  author = {Mikolov, Tom{\'a}{\v{s}}},
  year = 2012,
  address = {Brno, Czech Republic},
  urldate = {2025-12-06},
  school = {Brno University of Technology},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/RP7F77KQ/thesis.pdf}
}

@article{turingICOMPUTINGMACHINERYINTELLIGENCE1950,
  title = {Computing {{Machinery}} and {{Intelligence}}},
  author = {Turing, Alan M.},
  year = 1950,
  month = oct,
  journal = {Mind},
  volume = {LIX},
  number = {236},
  pages = {433--460},
  issn = {0026-4423},
  doi = {10.1093/mind/LIX.236.433},
  urldate = {2026-02-01}
}

@article{tverskyAdvancesProspectTheory1992,
  author = {Tversky, Amos and Kahneman, Daniel},
  title = {Advances in Prospect Theory: {{Cumulative}} Representation of Uncertainty},
  shorttitle = {Advances in Prospect Theory},
  journal = {Journal of Risk and Uncertainty},
  year = 1992,
  month = oct,
  volume = {5},
  number = {4},
  pages = {297--323},
  issn = {1573-0476},
  doi = {10.1007/BF00122574},
  urldate = {2025-12-01},
  langid = {english},
  keywords = {cumulative prospect theory},
  abstract = {We develop a new version of prospect theory that employs cumulative rather than separable decision weights and extends the theory in several respects. This version, called cumulative prospect theory, applies to uncertain as well as to risky prospects with any number of outcomes, and it allows different weighting functions for gains and for losses. Two principles, diminishing sensitivity and loss aversion, are invoked to explain the characteristic curvature of the value function and the weighting functions. A review of the experimental evidence and the results of a new experiment confirm a distinctive fourfold pattern of risk attitudes: risk aversion for gains and risk seeking for losses of high probability; risk seeking for gains and risk aversion for losses of low probability.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/QG3AXEZI/Tversky and Kahneman - 1992 - Advances in prospect theory Cumulative representation of uncertainty.pdf}
}

@article{tverskyJudgmentUncertaintyHeuristics1974,
  author = {Tversky, Amos and Kahneman, Daniel},
  title = {Judgment under {{Uncertainty}}: {{Heuristics}} and {{Biases}}},
  shorttitle = {Judgment under {{Uncertainty}}},
  journal = {Science},
  year = 1974,
  month = sep,
  volume = {185},
  number = {4157},
  pages = {1124--1131},
  publisher = {American Association for the Advancement of Science},
  doi = {10.1126/science.185.4157.1124},
  urldate = {2026-01-26},
  abstract = {This article described three heuristics that are employed in making judgments under uncertainty: (i) representativeness, which is usually employed when people are asked to judge the probability that an object or event A belongs to class or process B; (ii) availability of instances or scenarios, which is often employed when people are asked to assess the frequency of a class or the plausibility of a particular development; and (iii) adjustment from an anchor, which is usually employed in numerical prediction when a relevant value is available. These heuristics are highly economical and usually effective, but they lead to systematic and predictable errors. A better understanding of these heuristics and of the biases to which they lead could improve judgments and decisions in situations of uncertainty.}
}

@techreport{UN_HRC_39_CRP2_2018,
  type = {{{UN Fact}}-{{Finding Mission Report}}},
  title = {Report of the Detailed Findings of the {{Independent International Fact-Finding Mission}} on {{Myanmar}}},
  year = 2018,
  month = sep,
  number = {A/HRC/39/CRP.2},
  institution = {United Nations Human Rights Council, Independent International Fact-Finding Mission on Myanmar}
}

@misc{UnderstandingAIExtinction,
  title = {Understanding {{AI Extinction Risks}}},
  howpublished = {https://www.thecompendium.ai/},
  urldate = {2025-12-24},
  abstract = {The Compendium explains extinction risks from AI, where they come from, and how we can fix them.},
  langid = {english},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/3JXGAPUG/www.thecompendium.ai.html}
}

@article{vanderweelePromotionHumanFlourishing2017,
  author    = {VanderWeele, Tyler J.},
  title     = {On the Promotion of Human Flourishing},
  journal   = {Proceedings of the National Academy of Sciences},
  publisher = {Proceedings of the National Academy of Sciences},
  volume    = {114},
  number    = {31},
  pages     = {8148--8156},
  year      = 2017,
  month     = aug,
  doi       = {10.1073/pnas.1702996114},
  urldate   = {2025-12-22},
  abstract  = {Many empirical studies throughout the social and biomedical sciences focus only on very narrow outcomes such as income, or a single specific disease state, or a measure of positive affect. Human well-being or flourishing, however, consists in a much broader range of states and outcomes, certainly including mental and physical health, but also encompassing happiness and life satisfaction, meaning and purpose, character and virtue, and close social relationships. The empirical literature from longitudinal, experimental, and quasiexperimental studies is reviewed in attempt to identify major determinants of human flourishing, broadly conceived. Measures of human flourishing are proposed. Discussion is given to the implications of a broader conception of human flourishing, and of the research reviewed, for policy, and for future research in the biomedical and social sciences.},
  file      = {/home/james/snap/zotero-snap/common/Zotero/storage/SEFUBH5S/VanderWeele - 2017 - On the promotion of human flourishing.pdf}
}

@inproceedings{vanderweijAISandbaggingLanguage2025,
  title = {{{AI Sandbagging}}: {{Language Models}} Can {{Strategically Underperform}} on {{Evaluations}}},
  shorttitle = {{{AI Sandbagging}}},
  booktitle = {International Conference on Learning Representations},
  author = {{van der Weij}, Teun and Hofst{\"a}tter, Felix and Jaffe, Oliver and Brown, Samuel and Ward, Francis},
  year = 2025,
  month = may,
  volume = {2025},
  pages = {73152--73189},
  urldate = {2026-02-08},
  langid = {english},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/L4CS5DXJ/van der Weij et al. - 2025 - AI Sandbagging Language Models can Strategically Underperform on Evaluations.pdf}
}

@book{varianIntermediateMicroeconomicsModern2010,
  title = {Intermediate {{Microeconomics}}: {{A Modern Approach}}},
  shorttitle = {Intermediate {{Microeconomics}}},
  author = {Varian, Hal R.},
  year = 2010,
  edition = {Eighth},
  publisher = {W. W. Norton \& Company},
  abstract = {Varian is ``the Adam Smith of the new discipline of Googlenomics.''---Stephen Levy, Wired For over 20 years Hal Varian's Intermediate Microeconomics has given students the most current and complete coverage of intermediate microeconomics at an appropriate mathematical level. The Eighth Edition includes contemporary case studies and examples and relevant coverage of the current economic crisis---all in focused, lecture-length chapters.},
  googlebooks = {JMBfQgAACAAJ},
  isbn = {978-0-393-93424-3},
  langid = {english},
  keywords = {Business & Economics / Economics / Microeconomics}
}

@misc{vastai2026,
  title = {{{Vast.ai}} -- Affordable {{GPU}} Cloud Marketplace},
  author = {{Vast.ai, Inc.}},
  year = 2026,
  urldate = {2026-01-07},
  howpublished = {https://cloud.vast.ai/}
}

@misc{vaswaniAttentionAllYou2017,
  title = {Attention {{Is All You Need}}},
  author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Lukasz and Polosukhin, Illia},
  year = 2017,
  number = {arXiv:1706.03762},
  eprint = {1706.03762},
  primaryclass = {cs.CL},
  publisher = {arXiv},
  doi = {10.48550/arXiv.1706.03762},
  urldate = {2025-12-06},
  abstract = {The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/KPINTD5Y/Vaswani et al. - 2023 - Attention Is All You Need.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/VU423G5H/1706.html}
}

@misc{wangMatrixPeertoPeerMultiAgent2025,
  title = {Matrix: {{Peer-to-Peer Multi-Agent Synthetic Data Generation Framework}}},
  shorttitle = {Matrix},
  author = {Wang, Dong and Li, Yang and Ni, Ansong and Yeh, Ching-Feng and Emad, Youssef and Lei, Xinjie and Robbins, Liam and Padthe, Karthik and Xu, Hu and Li, Xian and Celikyilmaz, Asli and Raghavendra, Ramya and Huang, Lifei and Wu, Carole-Jean and Li, Shang-Wen},
  year = 2025,
  month = nov,
  number = {arXiv:2511.21686},
  eprint = {2511.21686},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2511.21686},
  urldate = {2025-11-27},
  abstract = {Synthetic data has become increasingly important for training large language models, especially when real data is scarce, expensive, or privacy-sensitive. Many such generation tasks require coordinated multi-agent workflows, where specialized agents collaborate to produce data that is higher quality, more diverse, and structurally richer. However, existing frameworks for multi-agent synthesis often depend on a centralized orchestrator, creating scalability bottlenecks, or are hardcoded for specific domains, limiting flexibility. We present Matrix, a decentralized framework that represents both control and data flow as serialized messages passed through distributed queues. This peer-to-peer design eliminates the central orchestrator. Each task progresses independently through lightweight agents, while compute-intensive operations, such as LLM inference or containerized environments, are handled by distributed services. Built on Ray, Matrix scales to tens of thousands of concurrent agentic workflows and provides a modular, configurable design that enables easy adaptation to a wide range of data generation workflows. We evaluate Matrix across diverse synthesis scenarios, such as multi-agent collaborative dialogue, web-based reasoning data extraction, and tool-use trajectory generation in customer service environments. In all cases, Matrix achieves $2$--$15\times$ higher data generation throughput under identical hardware resources, without compromising output quality.},
  archiveprefix = {arXiv},
  howpublished = {https://arxiv.org/abs/2511.21686v1},
  langid = {english},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/F6N2XJ6H/Wang et al. - 2025 - Matrix Peer-to-Peer Multi-Agent Synthetic Data Generation Framework.pdf}
}

@misc{wangVoyagerOpenEndedEmbodied2023,
  title = {Voyager: {{An Open-Ended Embodied Agent}} with {{Large Language Models}}},
  shorttitle = {Voyager},
  author = {Wang, Guanzhi and Xie, Yuqi and Jiang, Yunfan and Mandlekar, Ajay and Xiao, Chaowei and Zhu, Yuke and Fan, Linxi and Anandkumar, Anima},
  year = 2023,
  month = oct,
  number = {arXiv:2305.16291},
  eprint = {2305.16291},
  primaryclass = {cs.AI},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2305.16291},
  urldate = {2026-01-20},
  abstract = {We introduce Voyager, the first LLM-powered embodied lifelong learning agent in Minecraft that continuously explores the world, acquires diverse skills, and makes novel discoveries without human intervention. Voyager consists of three key components: 1) an automatic curriculum that maximizes exploration, 2) an ever-growing skill library of executable code for storing and retrieving complex behaviors, and 3) a new iterative prompting mechanism that incorporates environment feedback, execution errors, and self-verification for program improvement. Voyager interacts with GPT-4 via blackbox queries, which bypasses the need for model parameter fine-tuning. The skills developed by Voyager are temporally extended, interpretable, and compositional, which compounds the agent's abilities rapidly and alleviates catastrophic forgetting. Empirically, Voyager shows strong in-context lifelong learning capability and exhibits exceptional proficiency in playing Minecraft. It obtains 3.3x more unique items, travels 2.3x longer distances, and unlocks key tech tree milestones up to 15.3x faster than prior SOTA. Voyager is able to utilize the learned skill library in a new Minecraft world to solve novel tasks from scratch, while other techniques struggle to generalize. We open-source our full codebase and prompts at https://voyager.minedojo.org/.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/36GIBUYI/Wang et al. - 2023 - Voyager An Open-Ended Embodied Agent with Large Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/YKEVS696/2305.html}
}

@misc{weiChainofThoughtPromptingElicits2023,
  title = {Chain-of-{{Thought Prompting Elicits Reasoning}} in {{Large Language Models}}},
  author = {Wei, Jason and Wang, Xuezhi and Schuurmans, Dale and Bosma, Maarten and Ichter, Brian and Xia, Fei and Chi, Ed and Le, Quoc and Zhou, Denny},
  year = 2023,
  month = jan,
  number = {arXiv:2201.11903},
  eprint = {2201.11903},
  primaryclass = {cs.CL},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2201.11903},
  urldate = {2025-12-06},
  abstract = {We explore how generating a chain of thought -- a series of intermediate reasoning steps -- significantly improves the ability of large language models to perform complex reasoning. In particular, we show how such reasoning abilities emerge naturally in sufficiently large language models via a simple method called chain of thought prompting, where a few chain of thought demonstrations are provided as exemplars in prompting. Experiments on three large language models show that chain of thought prompting improves performance on a range of arithmetic, commonsense, and symbolic reasoning tasks. The empirical gains can be striking. For instance, prompting a 540B-parameter language model with just eight chain of thought exemplars achieves state of the art accuracy on the GSM8K benchmark of math word problems, surpassing even finetuned GPT-3 with a verifier.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/QQLISA9R/Wei et al. - 2023 - Chain-of-Thought Prompting Elicits Reasoning in Large Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/U8DBCIAV/2201.html}
}

@book{Wenman_2013,
  author    = {Wenman, Mark},
  title     = {Agonistic Democracy: {{Constituent}} Power in the Era of Globalisation},
  publisher = {Cambridge University Press},
  address   = {Cambridge},
  year      = 2013
}

@book{wienerHumanUseHuman2025,
  title = {The {{Human Use}} of {{Human Beings}}: {{Cybernetics}} and {{Society}}},
  shorttitle = {The {{Human Use}} of {{Human Beings}}},
  author = {Wiener, Norbert},
  year = 2025,
  month = aug,
  publisher = {HarperCollins},
  abstract = {For the 75th anniversary, a new edition of~The Human Use of Human Beings---the landmark book that delves into the relationship between humans and computers, and presciently anticipates many contemporary dilemmas surrounding AI technology. With a new introduction by Brian Christian, author of the bestselling~Algorithms to Live By and The Alignment Problem. In 1950, mathematician-philosopher Norbert Wiener ended this classic book on the place of machines in society with a warning: ``We shall never receive the right answers to our questions unless we ask the right questions.... The hour is very late, and the choice of good and evil knocks at our door.'' Wiener, the founder of the science of cybernetics---the study of the relationship between computers and the human nervous system---was widely mislabeled as an advocate for the automation of human life. As The Human Use of Human Beings reveals, his vision was much more complex and interesting, and is more relevant in today's world of AI than anyone could have anticipated. In his new introduction, Brian Christian aptly calls Wiener the ``progenitor of contemporary AI-safety discourse.''~Wiener hoped that machines would release people from relentless and repetitive drudgery to achieve more creative pursuits, yet he anticipated the danger of dehumanizing and displacement.~His pioneering views on the human-machine relationship as a ``communicative process'' are only more crucial now, as we carry in our pockets AI devices that we can literally speak to. His prescient warnings illuminate our contemporary relationships with language, art, and even social media. The Human Use of Human Beings~examines the implications of cybernetics for education, law, language, science, technology, as Wiener anticipates the enormous impact---in effect, a third industrial revolution---that the computer has had on our lives.},
  googlebooks = {770wEQAAQBAJ},
  isbn = {978-0-06-342320-6},
  langid = {english},
  keywords = {Computers / Artificial Intelligence / General,Computers / Cybernetics,Computers / History,Computers / Human-Computer Interaction (HCI),Computers / Social Aspects,Philosophy / Mind & Body,Science / Ethics,Science / History,Science / Philosophy & Social Aspects,Technology & Engineering / Social Aspects}
}

@article{williamsPossibilityOngoingMoral2015,
  author   = {Williams, Evan G.},
  title    = {The {{Possibility}} of an {{Ongoing Moral Catastrophe}}},
  journal  = {Ethical Theory and Moral Practice},
  volume   = {18},
  number   = {5},
  pages    = {971--982},
  year     = 2015,
  month    = nov,
  issn     = {1572-8447},
  doi      = {10.1007/s10677-015-9567-7},
  urldate  = {2026-01-27},
  langid   = {english},
  keywords = {Hedging,Moral mistakes,Moral uncertainty,Progress},
  abstract = {This article gives two arguments for believing that our society is unknowingly guilty of serious, large-scale wrongdoing. First is an inductive argument: most other societies, in history and in the world today, have been unknowingly guilty of serious wrongdoing, so ours probably is too. Second is a disjunctive argument: there are a large number of distinct ways in which our practices could turn out to be horribly wrong, so even if no particular hypothesized moral mistake strikes us as very likely, the disjunction of all such mistakes should receive significant credence. The article then discusses what our society should do in light of the likelihood that we are doing something seriously wrong: we should regard intellectual progress, of the sort that will allow us to find and correct our moral mistakes as soon as possible, as an urgent moral priority rather than as a mere luxury; and we should also consider it important to save resources and cultivate flexibility, so that when the time comes to change our policies we will be able to do so quickly and smoothly.},
  file     = {/home/james/snap/zotero-snap/common/Zotero/storage/DZ4KAFT7/Williams - 2015 - The Possibility of an Ongoing Moral Catastrophe.pdf}
}

@misc{WVS2020,
  title = {World {{Values Survey}}: {{Round Seven}}},
  author = {Haerpfer, Christian and Inglehart, Ronald and Moreno, Alejandro and Welzel, Christian and Kizilova, Kseniya and Diez-Medrano, Juan and Lagos, Marta and Norris, Pippa and Ponarin, Eduard and Puranen, Bi},
  year = 2024,
  publisher = {{JD Systems Institute and WVSA Secretariat}},
  address = {Madrid, Spain and Vienna, Austria},
  doi = {10.14281/18241.24}
}

@misc{xiaoSmoothQuantAccurateEfficient2024,
  title = {{{SmoothQuant}}: {{Accurate}} and {{Efficient Post-Training Quantization}} for {{Large Language Models}}},
  shorttitle = {{{SmoothQuant}}},
  author = {Xiao, Guangxuan and Lin, Ji and Seznec, Mickael and Wu, Hao and Demouth, Julien and Han, Song},
  year = 2024,
  month = mar,
  number = {arXiv:2211.10438},
  eprint = {2211.10438},
  primaryclass = {cs.CL},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2211.10438},
  urldate = {2026-01-25},
  abstract = {Large language models (LLMs) show excellent performance but are compute- and memory-intensive. Quantization can reduce memory and accelerate inference. However, existing methods cannot maintain accuracy and hardware efficiency at the same time. We propose SmoothQuant, a training-free, accuracy-preserving, and general-purpose post-training quantization (PTQ) solution to enable 8-bit weight, 8-bit activation (W8A8) quantization for LLMs. Based on the fact that weights are easy to quantize while activations are not, SmoothQuant smooths the activation outliers by offline migrating the quantization difficulty from activations to weights with a mathematically equivalent transformation. SmoothQuant enables an INT8 quantization of both weights and activations for all the matrix multiplications in LLMs, including OPT, BLOOM, GLM, MT-NLG, Llama-1/2, Falcon, Mistral, and Mixtral models. We demonstrate up to 1.56x speedup and 2x memory reduction for LLMs with negligible loss in accuracy. SmoothQuant enables serving 530B LLM within a single node. Our work offers a turn-key solution that reduces hardware costs and democratizes LLMs. Code is available at https://github.com/mit-han-lab/smoothquant.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/BMHBD7VZ/Xiao et al. - 2024 - SmoothQuant Accurate and Efficient Post-Training Quantization for Large Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/7FGXEB8M/2211.html}
}

@misc{yaoReActSynergizingReasoning2023a,
  title = {{{ReAct}}: {{Synergizing Reasoning}} and {{Acting}} in {{Language Models}}},
  shorttitle = {{{ReAct}}},
  author = {Yao, Shunyu and Zhao, Jeffrey and Yu, Dian and Du, Nan and Shafran, Izhak and Narasimhan, Karthik and Cao, Yuan},
  year = 2023,
  month = mar,
  number = {arXiv:2210.03629},
  eprint = {2210.03629},
  primaryclass = {cs.CL},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2210.03629},
  urldate = {2026-01-20},
  abstract = {While large language models (LLMs) have demonstrated impressive capabilities across tasks in language understanding and interactive decision making, their abilities for reasoning (e.g. chain-of-thought prompting) and acting (e.g. action plan generation) have primarily been studied as separate topics. In this paper, we explore the use of LLMs to generate both reasoning traces and task-specific actions in an interleaved manner, allowing for greater synergy between the two: reasoning traces help the model induce, track, and update action plans as well as handle exceptions, while actions allow it to interface with external sources, such as knowledge bases or environments, to gather additional information. We apply our approach, named ReAct, to a diverse set of language and decision making tasks and demonstrate its effectiveness over state-of-the-art baselines, as well as improved human interpretability and trustworthiness over methods without reasoning or acting components. Concretely, on question answering (HotpotQA) and fact verification (Fever), ReAct overcomes issues of hallucination and error propagation prevalent in chain-of-thought reasoning by interacting with a simple Wikipedia API, and generates human-like task-solving trajectories that are more interpretable than baselines without reasoning traces. On two interactive decision making benchmarks (ALFWorld and WebShop), ReAct outperforms imitation and reinforcement learning methods by an absolute success rate of 34\% and 10\% respectively, while being prompted with only one or two in-context examples. Project site with code: https://react-lm.github.io},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/CKM49GEJ/Yao et al. - 2023 - ReAct Synergizing Reasoning and Acting in Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/Z6YBS4UI/2210.html}
}

@book{yudkowskyIfAnyoneBuilds2025,
  title = {If {{Anyone Builds It}}, {{Everyone Dies}}: {{The Case Against Superintelligent AI}}},
  shorttitle = {If {{Anyone Builds It}}, {{Everyone Dies}}},
  author = {Yudkowsky, Eliezer and Soares, Nate},
  year = 2025,
  month = sep,
  publisher = {Random House},
  abstract = {THE INSTANT NEW YORK TIMES BESTSELLER 'The most important book of the decade' MAX TEGMARK, author of Life 3.0 'A loud trumpet call to humanity to awaken us as we sleepwalk into disaster - we must wake up' STEPHEN FRY `The best no-nonsense, simple explanation of the AI risk problem I've ever read' YISHAN WONG, former Reddit CEO AI is the greatest threat to our existence that we have ever faced. The scramble to create superhuman AI has put us on the path to extinction -- but it's not too late to change course. Two pioneering researchers in the field, Eliezer Yudkowsky and Nate Soares, explain why artificial superintelligence would be a global suicide bomb and call for an immediate halt to its development. The technology may be complex, but the facts are simple: companies and countries are in a race to build machines that will be smarter than any person, and the world is devastatingly unprepared for what will come next. Could a machine superintelligence wipe out our entire species? Would it want to? Would it want anything at all? In this urgent book, Yudkowsky and Soares explore the theory and the evidence, present one possible extinction scenario and explain what it would take for humanity to survive. The world is racing to build something truly new -- and if anyone builds it, everyone dies. *A GUARDIAN AND NEW STATESMAN BOOK OF THE YEAR*},
  googlebooks = {1ig\_EQAAQBAJ},
  isbn = {978-1-5299-6467-7},
  langid = {english},
  keywords = {Computers / Artificial Intelligence / General,Science / Philosophy & Social Aspects,Social Science / Future Studies,Social Science / Technology Studies,Technology & Engineering / General,Technology & Engineering / Social Aspects}
}

@misc{yudkowskyPurchaseFuzziesUtilons2009,
  title = {Purchase {{Fuzzies}} and {{Utilons Separately}}},
  author = {Yudkowsky, Eliezer},
  year = 2009,
  month = apr,
  howpublished = {LessWrong},
  urldate = {2025-12-22},
  abstract = {Yesterday: {$\bullet$} {$>$} There is this very, very old puzzle/observation in economics about the lawyer who spends an hour volunteering at the soup kitchen, ins\dots},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/8LYR7TC9/purchase-fuzzies-and-utilons-separately.html}
}

@misc{zhaoSurveyLargeLanguage2025,
  title = {A {{Survey}} of {{Large Language Models}}},
  author = {Zhao, Wayne Xin and Zhou, Kun and Li, Junyi and Tang, Tianyi and Wang, Xiaolei and Hou, Yupeng and Min, Yingqian and Zhang, Beichen and Zhang, Junjie and Dong, Zican and Du, Yifan and Yang, Chen and Chen, Yushuo and Chen, Zhipeng and Jiang, Jinhao and Ren, Ruiyang and Li, Yifan and Tang, Xinyu and Liu, Zikang and Liu, Peiyu and Nie, Jian-Yun and Wen, Ji-Rong},
  year = 2025,
  month = mar,
  number = {arXiv:2303.18223},
  eprint = {2303.18223},
  primaryclass = {cs.CL},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2303.18223},
  urldate = {2026-02-08},
  abstract = {Language is essentially a complex, intricate system of human expressions governed by grammatical rules. It poses a significant challenge to develop capable AI algorithms for comprehending and grasping a language. As a major approach, language modeling has been widely studied for language understanding and generation in the past two decades, evolving from statistical language models to neural language models. Recently, pre-trained language models (PLMs) have been proposed by pre-training Transformer models over large-scale corpora, showing strong capabilities in solving various NLP tasks. Since researchers have found that model scaling can lead to performance improvement, they further study the scaling effect by increasing the model size to an even larger size. Interestingly, when the parameter scale exceeds a certain level, these enlarged language models not only achieve a significant performance improvement but also show some special abilities that are not present in small-scale language models. To discriminate the difference in parameter scale, the research community has coined the term large language models (LLM) for the PLMs of significant size. Recently, the research on LLMs has been largely advanced by both academia and industry, and a remarkable progress is the launch of ChatGPT, which has attracted widespread attention from society. The technical evolution of LLMs has been making an important impact on the entire AI community, which would revolutionize the way how we develop and use AI algorithms. In this survey, we review the recent advances of LLMs by introducing the background, key findings, and mainstream techniques. In particular, we focus on four major aspects of LLMs, namely pre-training, adaptation tuning, utilization, and capacity evaluation. Besides, we also summarize the available resources for developing LLMs and discuss the remaining issues for future directions.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/54J32P6V/Zhao et al. - 2025 - A Survey of Large Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/SHNU3HZB/2303.html}
}

@inproceedings{zhaoWorldValuesBenchLargeScaleBenchmark2024,
  title = {{{WorldValuesBench}}: {{A Large-Scale Benchmark Dataset}} for {{Multi-Cultural Value Awareness}} of {{Language Models}}},
  shorttitle = {{{WorldValuesBench}}},
  booktitle = {Proceedings of the 2024 {{Joint International Conference}} on {{Computational Linguistics}}, {{Language Resources}} and {{Evaluation}} ({{LREC-COLING}} 2024)},
  author = {Zhao, Wenlong and Mondal, Debanjan and Tandon, Niket and Dillion, Danica and Gray, Kurt and Gu, Yuling},
  editor = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen},
  year = 2024,
  month = may,
  pages = {17696--17706},
  publisher = {{ELRA and ICCL}},
  address = {Torino, Italia},
  urldate = {2025-12-07},
  abstract = {The awareness of multi-cultural human values is critical to the ability of language models (LMs) to generate safe and personalized responses. However, this awareness of LMs has been insufficiently studied, since the computer science community lacks access to the large-scale real-world data about multi-cultural values. In this paper, we present WorldValuesBench, a globally diverse, large-scale benchmark dataset for the multi-cultural value prediction task, which requires a model to generate a rating response to a value question based on demographic contexts. Our dataset is derived from an influential social science project, World Values Survey (WVS), that has collected answers to hundreds of value questions (e.g., social, economic, ethical) from 94,728 participants worldwide. We have constructed more than 20 million examples of the type ``(demographic attributes, value question) {$\rightarrow$} answer'' from the WVS responses. We perform a case study using our dataset and show that the task is challenging for strong open and closed-source models. On merely 11.1\%, 25.0\%, 72.2\%, and 75.0\% of the questions, Alpaca-7B, Vicuna-7B-v1.5, Mixtral-8x7B-Instruct-v0.1, and GPT-3.5 Turbo can respectively achieve {$<$}0.2 Wasserstein 1-distance from the human normalized answer distributions. WorldValuesBench opens up new research avenues in studying limitations and opportunities in multi-cultural value awareness of LMs.},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/J6P5DJAC/Zhao et al. - 2024 - WorldValuesBench A Large-Scale Benchmark Dataset for Multi-Cultural Value Awareness of Language Mod.pdf}
}

@misc{zhengJudgingLLMasaJudgeMTBench2023,
  title = {Judging {{LLM-as-a-Judge}} with {{MT-Bench}} and {{Chatbot Arena}}},
  author = {Zheng, Lianmin and Chiang, Wei-Lin and Sheng, Ying and Zhuang, Siyuan and Wu, Zhanghao and Zhuang, Yonghao and Lin, Zi and Li, Zhuohan and Li, Dacheng and Xing, Eric P. and Zhang, Hao and Gonzalez, Joseph E. and Stoica, Ion},
  year = 2023,
  month = dec,
  number = {arXiv:2306.05685},
  eprint = {2306.05685},
  primaryclass = {cs.CL},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2306.05685},
  urldate = {2026-01-26},
  abstract = {Evaluating large language model (LLM) based chat assistants is challenging due to their broad capabilities and the inadequacy of existing benchmarks in measuring human preferences. To address this, we explore using strong LLMs as judges to evaluate these models on more open-ended questions. We examine the usage and limitations of LLM-as-a-judge, including position, verbosity, and self-enhancement biases, as well as limited reasoning ability, and propose solutions to mitigate some of them. We then verify the agreement between LLM judges and human preferences by introducing two benchmarks: MT-bench, a multi-turn question set; and Chatbot Arena, a crowdsourced battle platform. Our results reveal that strong LLM judges like GPT-4 can match both controlled and crowdsourced human preferences well, achieving over 80\% agreement, the same level of agreement between humans. Hence, LLM-as-a-judge is a scalable and explainable way to approximate human preferences, which are otherwise very expensive to obtain. Additionally, we show our benchmark and traditional benchmarks complement each other by evaluating several variants of LLaMA and Vicuna. The MT-bench questions, 3K expert votes, and 30K conversations with human preferences are publicly available at https://github.com/lm-sys/FastChat/tree/main/fastchat/llm\_judge.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/68SX38IY/Zheng et al. - 2023 - Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/NW8Y9EEQ/2306.html}
}

@misc{zhouLIMALessMore2023,
  title = {{{LIMA}}: {{Less Is More}} for {{Alignment}}},
  shorttitle = {{{LIMA}}},
  author = {Zhou, Chunting and Liu, Pengfei and Xu, Puxin and Iyer, Srini and Sun, Jiao and Mao, Yuning and Ma, Xuezhe and Efrat, Avia and Yu, Ping and Yu, Lili and Zhang, Susan and Ghosh, Gargi and Lewis, Mike and Zettlemoyer, Luke and Levy, Omer},
  year = 2023,
  month = may,
  number = {arXiv:2305.11206},
  eprint = {2305.11206},
  primaryclass = {cs.CL},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2305.11206},
  urldate = {2026-02-02},
  abstract = {Large language models are trained in two stages: (1) unsupervised pretraining from raw text, to learn general-purpose representations, and (2) large scale instruction tuning and reinforcement learning, to better align to end tasks and user preferences. We measure the relative importance of these two stages by training LIMA, a 65B parameter LLaMa language model fine-tuned with the standard supervised loss on only 1,000 carefully curated prompts and responses, without any reinforcement learning or human preference modeling. LIMA demonstrates remarkably strong performance, learning to follow specific response formats from only a handful of examples in the training data, including complex queries that range from planning trip itineraries to speculating about alternate history. Moreover, the model tends to generalize well to unseen tasks that did not appear in the training data. In a controlled human study, responses from LIMA are either equivalent or strictly preferred to GPT-4 in 43\% of cases; this statistic is as high as 58\% when compared to Bard and 65\% versus DaVinci003, which was trained with human feedback. Taken together, these results strongly suggest that almost all knowledge in large language models is learned during pretraining, and only limited instruction tuning data is necessary to teach models to produce high quality output.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/9HMVRGXK/Zhou et al. - 2023 - LIMA Less Is More for Alignment.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/RFTNFIXK/2305.html}
}

@misc{zieglerFineTuningLanguageModels2020,
  title = {Fine-{{Tuning Language Models}} from {{Human Preferences}}},
  author = {Ziegler, Daniel M. and Stiennon, Nisan and Wu, Jeffrey and Brown, Tom B. and Radford, Alec and Amodei, Dario and Christiano, Paul and Irving, Geoffrey},
  year = 2020,
  month = jan,
  number = {arXiv:1909.08593},
  eprint = {1909.08593},
  primaryclass = {cs.CL},
  publisher = {arXiv},
  doi = {10.48550/arXiv.1909.08593},
  urldate = {2025-11-16},
  abstract = {Reward learning enables the application of reinforcement learning (RL) to tasks where reward is defined by human judgment, building a model of reward by asking humans questions. Most work on reward learning has used simulated environments, but complex information about values is often expressed in natural language, and we believe reward learning for language is a key to making RL practical and safe for real-world tasks. In this paper, we build on advances in generative pretraining of language models to apply reward learning to four natural language tasks: continuing text with positive sentiment or physically descriptive language, and summarization tasks on the TL;DR and CNN/Daily Mail datasets. For stylistic continuation we achieve good results with only 5,000 comparisons evaluated by humans. For summarization, models trained with 60,000 comparisons copy whole sentences from the input but skip irrelevant preamble; this leads to reasonable ROUGE scores and very good performance according to our human labelers, but may be exploiting the fact that labelers rely on simple heuristics.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning,Statistics - Machine Learning},
  file = {/home/james/snap/zotero-snap/common/Zotero/storage/9T2PCTZ5/Ziegler et al. - 2020 - Fine-Tuning Language Models from Human Preferences.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/C58EKGZD/1909.html}
}