diff --git a/data/xml/2025.aimecon.xml b/data/xml/2025.aimecon.xml new file mode 100644 index 0000000000..700684a66b --- /dev/null +++ b/data/xml/2025.aimecon.xml @@ -0,0 +1,981 @@ + + + + + Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Full Papers + JoshuaWilson + ChristopherOrmerod + MagdalenBeiting Parrish + National Council on Measurement in Education (NCME) +
Wyndham Grand Pittsburgh Downtown, Pittsburgh, Pennsylvania, United States
+ October + 2025 + 2025.aimecon-main + aimecon + 979-8-218-84228-4 + + + 2025.aimecon-main.0 + aime-con-2025-main + + + Input Optimization for Automated Scoring in Reading Assessment + Ji YoonJungBoston College + UmmugulBezirhanBoston College + Matthiasvon DavierBoston College + 1-8 + This study examines input optimization for enhanced efficiency in automated scoring (AS) of reading assessments, which typically involve lengthy passages and complex scoring guides. We propose optimizing input size using question-specific summaries and simplified scoring guides. Findings indicate that input optimization via compression is achievable while maintaining AS performance. + 2025.aimecon-main.1 + jung-etal-2025-input + + + Implementation Considerations for Automated <fixed-case>AI</fixed-case> Grading of Student Work + ZeweiTian + AlexLiuUniversity of Washington + LiefEsbenshadeUniversity of Washington + ShawonSarkarUniversity of Washington + ZacharyZhangHensun Innovation + KevinHeHensun Innovation + MinSunUniversity of Washington + 9-20 + Nineteen K-12 teachers participated in a co-design pilot study of an AI education platform, testing assessment grading. Teachers valued AI’s rapid narrative feedback for formative assessment but distrusted automated scoring, preferring human oversight. Students appreciated immediate feedback but remained skeptical of AI-only grading, highlighting needs for trustworthy, teacher-centered AI tools. + 2025.aimecon-main.2 + tian-etal-2025-implementation + + + Compare Several Supervised Machine Learning Methods in Detecting Aberrant Response Pattern + YiLuFederation of State Boards of Physical Therapy + YuZhangThe Federation of State Boards of Physical Therapy + LorinMuellerFederation of State Boards of Physical Therapy + 21-24 + Aberrant response patterns, e.g., a test taker answers difficult questions correctly but is unable to answer easy questions correctly, are first identified using lz and lz*. We then compared the performance of five supervised machine learning methods in detecting aberrant response patterns identified by lz or lz*. + 2025.aimecon-main.3 + lu-etal-2025-compare + + + Leveraging multi-<fixed-case>AI</fixed-case> agents for a teacher co-design + HongwenGuoETS Research Institute + Matthew S.JohnsonETS Research Institute + LuisSaldiviaETS + MichelleWorthingtonETS + KadriyeErcikanETS + 25-34 + This study uses multi-AI agents to accelerate teacher co-design efforts. It innovatively links student profiles obtained from numerical assessment data to AI agents in natural languages. The AI agents simulate human inquiry, enrich feedback and ground it in teachers’ knowledge and practice, showing significant potential for transforming assessment practice and research. + 2025.aimecon-main.4 + guo-etal-2025-leveraging + + + Long context Automated Essay Scoring with Language Models + ChristopherOrmerodCambium Assessment + GititKehatCambium Assessment + 35-42 + In this study, we evaluate several models that incorporate architectural modifications to overcome the length limitations of the standard transformer architecture using the Kaggle ASAP 2.0 dataset. The models considered in this study include fine-tuned versions of XLNet, Longformer, ModernBERT, Mamba, and Llama.
+ 2025.aimecon-main.5 + ormerod-kehat-2025-long + + + Optimizing Reliability Scoring for <fixed-case>ILSA</fixed-case>s + Ji YoonJungBoston College + UmmugulBezirhanBoston College + Matthiasvon DavierBoston College + 43-49 + This study proposes an innovative method for evaluating cross-country scoring reliability (CCSR) in multilingual assessments, using hyperparameter optimization and a similarity-based weighted majority scoring within a single human scoring framework. Results show that this approach provides a cost-effective and comprehensive assessment of CCSR without the need for additional raters. + 2025.aimecon-main.6 + jung-etal-2025-optimizing + + + Exploring <fixed-case>AI</fixed-case>-Enabled Test Practice, Affect, and Test Outcomes in Language Assessment + JillBursteinDuolingo + RamseyCardwellDuolingo + Ping-LinChuangDuolingo + AllisonMichalowskiDuolingo + StevenNydickDuolingo + 50-57 + We analyzed data from 25,969 test takers of a high-stakes, computer-adaptive English proficiency test to examine relationships between repeated use of AI-generated practice tests and performance, affect, and score-sharing behavior. Taking 1–3 practice tests was associated with higher scores and confidence, while higher usage showed different engagement and outcome + 2025.aimecon-main.7 + burstein-etal-2025-exploring + + + Develop a Generic Essay Scorer for Practice Writing Tests of Statewide Assessments + YiGuiThe University of Iowa + 58-81 + This study examines whether NLP transfer learning techniques, specifically BERT, can be used to develop prompt-generic AES models for practice writing tests. Findings reveal that fine-tuned DistilBERT, without further pre-training, achieves high agreement (QWK ≈ 0.89), enabling scalable, robust AES models in statewide K-12 assessments without costly supplementary pre-training. + 2025.aimecon-main.8 + gui-2025-develop + + + Towards assessing persistence in reading in young learners using pedagogical agents + CaitlinTenisonETS + BeataBeigman KelbanovETS + NoahSchroederUniversity of Florida + ShanZhangUniversity of Florida + MichaelSuhanEducational Testing Service + ChuyangZhangUniversity of Florida + 82-90 + This pilot study investigated the use of a pedagogical agent to administer a conversational survey to second graders following a digital reading activity, measuring comprehension, persistence, and enjoyment. Analysis of survey responses and behavioral log data provide evidence for recommendations for the design of agent-mediated assessment in early literacy. + 2025.aimecon-main.9 + tenison-etal-2025-towards + + + <fixed-case>LLM</fixed-case>-Based Approaches for Detecting Gaming the System in Self-Explanation + Jiayi (Joyce)ZhangUniversity of Pennsylvania + Ryan S.BakerUniversity of Pennsylvania + Bruce M.McLarenCarnegie Mellon University + 91-98 + This study compares two LLM-based approaches for detecting gaming behavior in students’ open-ended responses within a math digital learning game. The sentence embedding method outperformed the prompt-based approach and was more conservative. Consistent with prior research, gaming correlated negatively with learning, highlighting LLMs’ potential to detect disengagement in open-ended tasks. 
+ 2025.aimecon-main.10 + zhang-etal-2025-llm-based + + + Evaluating the Impact of <fixed-case>LLM</fixed-case>-guided Reflection on Learning Outcomes with Interactive <fixed-case>AI</fixed-case>-Generated Educational Podcasts + VishnuMenonDrexel University + AndyCherneyDrexel University + Elizabeth B.CloudeMichigan State University + LiZhangDrexel University + Tiffany DiemDoDrexel University + 99-106 + This study examined whether embedding LLM-guided reflection prompts in an interactive AI-generated podcast improved learning and user experience compared to a version without prompts. Thirty-six undergraduates participated, and while learning outcomes were similar across conditions, reflection prompts reduced perceived attractiveness, highlighting a call for more research on reflective interactivity design. + 2025.aimecon-main.11 + menon-etal-2025-evaluating + + + Generative <fixed-case>AI</fixed-case> in the K–12 Formative Assessment Process: Enhancing Feedback in the Classroom + Mike ThomasMaksimchukKent Intermediate School District + EdwardRoeberMichigan Assessment Consortium + DavieStoreKent Intermediate School District + 107-110 + This paper explores how generative AI can enhance K–12 formative assessment by improving feedback, supporting task design, fostering student metacognition, and building teacher assessment literacy. It addresses challenges of equity, ethics, and implementation, offering practical strategies and case studies to guide responsible AI integration in classroom formative assessment practices. + 2025.aimecon-main.12 + maksimchuk-etal-2025-generative + + + Using Large Language Models to Analyze Students’ Collaborative Argumentation in Classroom Discussions + NhatTranUniversity of Pittsburgh + DianeLitman + AmandaGodleyUniversity of Pittsburgh + 111-125 + Collaborative argumentation enables students to build disciplinary knowledge and to think in disciplinary ways. We use Large Language Models (LLMs) to improve existing methods for collaboration classification and argument identification. Results suggest that LLMs are effective for both tasks and should be considered as a strong baseline for future research. + 2025.aimecon-main.13 + tran-etal-2025-using + + + Evaluating Generative <fixed-case>AI</fixed-case> as a Mentor Resource: Bias and Implementation Challenges + JiminLeeClark University + Alena GEspositoClark University + 126-133 + We explored how students’ perceptions of helpfulness and caring skew their ability to identify AI versus human mentorship responses. Emotionally resonant responses often lead to misattributions, indicating perceptual biases that shape mentorship judgments. The findings inform ethical, relational, and effective integration of AI in student support. + 2025.aimecon-main.14 + lee-esposito-2025-evaluating + + + <fixed-case>AI</fixed-case>-Based Classification of <fixed-case>TIMSS</fixed-case> Items for Framework Alignment + UmmugulBezirhanBoston College + Matthiasvon DavierBoston College + 134-141 + Large-scale assessments rely on expert panels to verify that test items align with prescribed frameworks, a labor-intensive process. This study evaluates the use of GPT-4o to classify TIMSS items to content domain, cognitive domain, and difficulty categories. Findings highlight the potential of language models to support scalable, framework-aligned item verification. 
+ 2025.aimecon-main.15 + bezirhan-von-davier-2025-ai + + + Towards Reliable Generation of Clinical Chart Items: A Counterfactual Reasoning Approach with Large Language Models + JiaxuanLiUniversity of California Irvine + SaedRezayiNBME + PeterBaldwinNational Board of Medical Examiners + PolinaHarikNBME + VictoriaYanevaNational Board of Medical Examiners + 142-153 + This study explores GPT-4 for generating clinical chart items in medical education using three prompting strategies. Expert evaluations found many items usable or promising. The counterfactual approach enhanced novelty, and item quality improved with high-surprisal examples. This is the first investigation of LLMs for automated clinical chart item generation. + 2025.aimecon-main.16 + li-etal-2025-towards-reliable + + + Using Whisper Embeddings for Audio-Only Latent Token Classification of Classroom Management Practices + Wesley GriffithMorris + JessicaVitaleVanderbilt University + IsabelArveloVanderbilt University + 154-162 + In this study, we developed a textless NLP system using a fine-tuned Whisper encoder to identify classroom management practices from noisy classroom recordings. The model segments teacher speech from non-teacher speech and performs multi-label classification of classroom practices, achieving acceptable accuracy without requiring transcript generation. + 2025.aimecon-main.17 + morris-etal-2025-using + + + Comparative Study of Double Scoring Design for Measuring Mathematical Quality of Instruction + Jonathan KyleFosterUniversity at Albany + JamesDrimallaUniversity of Virginia + NursultanJapashovUniversity at Albany + 163-171 + The integration of automated scoring, and whether it might meet the extensive need for double scoring in classroom observation systems, is the focus of this study. We outline an accessible approach for determining the interchangeability of automated systems within comparative scoring design studies. + 2025.aimecon-main.18 + foster-etal-2025-comparative + + + Toward Automated Evaluation of <fixed-case>AI</fixed-case>-Generated Item Drafts in Clinical Assessment + TazinAfrinNBME + Le AnHaHo Chi Minh City University of Foreign Languages and Information Technology + VictoriaYanevaNational Board of Medical Examiners + KeelanEvaniniNBME + StevenGoNBME + KristineDeRuchieNBME + MichaelHeiligNBME + 172-182 + This study examines the classification of AI-generated clinical multiple-choice question drafts as “helpful” or “non-helpful” starting points. Expert judgments were analyzed, and multiple classifiers were evaluated—including feature-based models, fine-tuned transformers, and few-shot prompting with GPT-4. Our findings highlight the challenges and considerations for evaluation methods of AI-generated items in clinical test development. + 2025.aimecon-main.19 + afrin-etal-2025-toward + + + Numeric Information in Elementary School Texts Generated by <fixed-case>LLM</fixed-case>s vs Human Experts + AnastasiaSmirnovaSan Francisco State University + Erin S.LeeUniversity of California, Berkeley + ShiyingLiSan Francisco State University + 183-191 + We analyze GPT-4o’s ability to represent numeric information in texts for elementary school children and assess it with respect to the human baseline. We show that both humans and GPT-4o reduce the amount of numeric information when adapting informational texts for children, but GPT-4o retains more complex numeric types than humans do.
+ 2025.aimecon-main.20 + smirnova-etal-2025-numeric + + + Towards evaluating teacher discourse without task-specific fine-tuning data + BeataBeigman Klebanov + MichaelSuhanEducational Testing Service + Jamie N.MikeskaETS + 192-200 + Teaching simulations with feedback are one way to provide teachers with practice opportunities to help improve their skills. We investigated methods to build evaluation models of teacher performance in leading a discussion in a simulated classroom, particularly for tasks with little performance data. + 2025.aimecon-main.21 + beigman-klebanov-etal-2025-towards + + + Linguistic proficiency of humans and <fixed-case>LLM</fixed-case>s in <fixed-case>J</fixed-case>apanese: Effects of task demands and content + May LynnReeseSan Francisco State University + AnastasiaSmirnovaSan Francisco State University + 201-211 + We evaluate linguistic proficiency of humans and LLMs on pronoun resolution in Japanese, using the Winograd Schema Challenge dataset. Humans outperform LLMs in the baseline condition, but we find evidence for task demand effects in both humans and LLMs. We also find that LLMs surpass human performance in scenarios referencing US culture, providing strong evidence for content effects. + 2025.aimecon-main.22 + reese-smirnova-2025-linguistic + + + Generative <fixed-case>AI</fixed-case> Teaching Simulations as Formative Assessment Tools within Preservice Teacher Preparation + Jamie N.MikeskaETS + AakankshaBhatiaExcelOne + ShreyashiHalderETS + TriciaMaxwellETS + BeataBeigman Klebanov + BennyLongwillETS + KashishBehlETS + CalliShekellThiel University + 212-220 + This paper examines how generative AI (GenAI) teaching simulations can be used as a formative assessment tool to gain insight into elementary preservice teachers’ (PSTs’) instructional abilities. This study investigated the teaching moves PSTs used to elicit student thinking in a GenAI simulation and their perceptions of the simulation’s + 2025.aimecon-main.23 + mikeska-etal-2025-generative + + + Using <fixed-case>LLM</fixed-case>s to identify features of personal and professional skills in an open-response situational judgment test + ColeWalshAcuity Insights + RodicaIvanAcuity Insights + Muhammad ZafarIqbalAcuity Insights + ColleenRobbAcuity Insights + 221-230 + Current methods for assessing personal and professional skills lack scalability due to reliance on human raters, while NLP-based systems for assessing these skills fail to demonstrate construct validity. This study introduces a new method utilizing LLMs to extract construct-relevant features from responses to an assessment of personal and professional skills. + 2025.aimecon-main.24 + walsh-etal-2025-using + + + Automated Evaluation of Standardized Patients with <fixed-case>LLM</fixed-case>s + AndrewEmersonNational Board of Medical Examiners + Le AnHaHo Chi Minh City University of Foreign Languages and Information Technology + KeelanEvaniniNBME + SuSomayNational Board of Medical Examiners + KevinFromeNational Board of Medical Examiners + PolinaHarikNBME + VictoriaYanevaNational Board of Medical Examiners + 231-238 + Standardized patients (SPs) are essential for clinical reasoning assessments in medical education. This paper introduces evaluation metrics that apply to both human and simulated SP systems. The metrics are computed using two LLM-as-a-judge approaches that align with human evaluators on SP performance, enabling scalable formative clinical reasoning assessments.
+ 2025.aimecon-main.25 + emerson-etal-2025-automated + + + <fixed-case>LLM</fixed-case>-Human Alignment in Evaluating Teacher Questioning Practices: Beyond Ratings to Explanation + RuikunHouTechnical University of Munich + TimFüttererUniversity of Tübingen + BabetteBühlerTechnical University of Munich + PatrickSchreyerUniversity of Kassel + PeterGerjetsLeibniz-Institut für Wissensmedien + UlrichTrautweinUniversity of Tübingen + EnkelejdaKasneciTechnical University of Munich + 239-249 + This study investigates the alignment between large language models (LLMs) and human raters in assessing teacher questioning practices, moving beyond rating agreement to the evidence selected to justify their decisions. Findings highlight LLMs’ potential to support large-scale classroom observation through interpretable, evidence-based scoring, with possible implications for concrete teacher feedback. + 2025.aimecon-main.26 + hou-etal-2025-llm + + + Leveraging Fine-tuned Large Language Models in Item Parameter Prediction + SuhwaHanCambium Assessment + FrankRijmenCambium Assessment + Allison AmesBoykinNBME + SusanLottridgeCambium Assessment + 250-264 + The study introduces novel approaches for fine-tuning pre-trained LLMs to predict item response theory parameters directly from item texts and structured item attribute variables. The proposed methods were evaluated on a dataset of over 1,000 English Language Arts items that are currently in the operational pool for a large-scale assessment. + 2025.aimecon-main.27 + han-etal-2025-leveraging-fine + + + How Model Size, Temperature, and Prompt Style Affect <fixed-case>LLM</fixed-case>-Human Assessment Score Alignment + JulieJungHarvard University + MaxLuHarvard University + Sina CholeBenkerUniversity of Münster + DogusDariciUniversity of Münster + 265-273 + We examined how model size, temperature, and prompt style affect Large Language Models’ (LLMs) alignment with human raters in assessing clinical reasoning skills. Model size emerged as a key factor in LLM-human score alignment. Findings reveal both the potential for scalable LLM-raters and the risks of relying on them exclusively. + 2025.aimecon-main.28 + jung-etal-2025-model + + + Assessing <fixed-case>AI</fixed-case> skills: A washback point of view + MeiravArieli-AttaliFordham University + BeataBeigman Klebanov + TenahaO’Reilly + DiegoZapata-RiveraETS + TamiSabag-ShushanNational Authority for Testing and Evaluation, Israel + ImanAwadieNational Authority for Testing and Evaluation, Israel + 274-280 + The emerging dominance of AI in the perception of skills-of-the-future makes assessing AI skills necessary to help guide learning. Creating an assessment of AI skills poses some new challenges. We examine those from the point of view of washback, and exemplify using two exploration studies conducted with 9th grade students. + 2025.aimecon-main.29 + arieli-attali-etal-2025-assessing + + + Using Generative <fixed-case>AI</fixed-case> to Develop a Common Metric in Item Response Theory + PeterBaldwinNational Board of Medical Examiners + 281-289 + We propose a method for linking independently calibrated item response theory (IRT) scales using large language models to generate shared parameter estimates across forms. Applied to medical licensure data, the approach reliably recovers slope values across all conditions and yields accurate intercepts when cross-form differences in item difficulty are small.
+ 2025.aimecon-main.30 + baldwin-2025-using + + + Augmented Measurement Framework for Dynamic Validity and Reciprocal Human-<fixed-case>AI</fixed-case> Collaboration in Assessment + TaiwoFeyijimiUniversity of Georgia + Daniel OOyeniranThe University of Alabama + OukayodeApataTexas A&M University + Henry SanmiMakinde + Hope OluwaseunAdegokeUniversity of North Carolina, Greensboro + JohnAjamobeTexas A&M University + JusticeDadzieThe University of Alabama + 290-296 + The proliferation of Generative Artificial Intelligence presents unprecedented opportunities and profound challenges for educational measurement. This study introduces the Augmented Measurement Framework grounded in four core principles. The paper discusses practical applications, implications for professional development and policy, and charts a research agenda for advancing this framework in educational measurement. + 2025.aimecon-main.31 + feyijimi-etal-2025-augmented + + + Patterns of Inquiry, Scaffolding, and Interaction Profiles in Learner-<fixed-case>AI</fixed-case> Collaborative Math Problem-Solving + ZilongPanLehigh University + ShenBaThe Education University of Hong Kong + ZiluJiangJohns Hopkins University + ChengluLiUniversity of Utah + 297-305 + This study investigates inquiry and scaffolding patterns between students and MathPal, a math AI agent, during problem-solving tasks. Using qualitative coding, lag sequential analysis, and Epistemic Network Analysis, the study identifies distinct interaction profiles, revealing how personalized AI feedback shapes student learning behaviors and inquiry dynamics in mathematics problem-solving activities. + 2025.aimecon-main.32 + pan-etal-2025-patterns + + + Pre-trained Transformer Models for Standard-to-Standard Alignment Study + Hye-JeongChoiHumRRO + ReeseButterfussCentriverse + MengFanHumRRO + 306-311 + The current study evaluated the accuracy of five pre-trained large language models (LLMs) in matching human judgment for a standard-to-standard alignment study. Results demonstrated comparable performance across LLMs despite differences in scale and computational demands. Additionally, incorporating domain labels as auxiliary information did not enhance LLM performance. These findings provide initial evidence for the viability of open-source LLMs to facilitate alignment studies and offer insights into the utility of auxiliary information. + 2025.aimecon-main.33 + choi-etal-2025-pre + + + From Entropy to Generalizability: Strengthening Automated Essay Scoring Reliability and Sustainability + YiGuiThe University of Iowa + 312-328 + Generalizability Theory with entropy-derived stratification optimized automated essay scoring reliability. A G-study decomposed variance across 14 encoders and 3 seeds; D-studies identified minimal ensembles achieving G ≥ 0.85. A hybrid of one medium and one small encoder with two seeds maximized dependability per compute cost. Stratification ensured uniform precision across + 2025.aimecon-main.34 + gui-2025-entropy + + + Undergraduate Students’ Appraisals and Rationales of <fixed-case>AI</fixed-case> Fairness in Higher Education + VictoriaDelaneySan Diego State University + SundaySteinSan Diego State University and University of California, San Diego + LilySawiSan Diego State University + KatyaHernandez HollidaySan Diego State University and University of California, San Diego + 329-336 + To measure learning with AI, students must be afforded opportunities to use AI consistently across courses.
Our interview study of 36 undergraduates revealed that students make independent appraisals of AI fairness amid school policies and use AI inconsistently on school assignments. We discuss tensions for measurement raised from students’ responses. + 2025.aimecon-main.35 + delaney-etal-2025-undergraduate + + + <fixed-case>AI</fixed-case>-Generated Formative Practice and Feedback: Performance Benchmarks and Applications in Higher Education + Rachelvan CampenhoutVitalSource + Michelle WeaverClarkVitalSource + Jeffrey S.DittelVitalSource + BillJeromeVitalSource + NickBrownVitalSource + BennyJohnsonVitalSource Technologies + 337-344 + Millions of AI-generated formative practice questions across thousands of publisher etextbooks are available for student use in higher education. We review the research to address both performance metrics for questions and feedback calculated from student data, and discuss the importance of successful applications in the classroom to maximize learning potential. + 2025.aimecon-main.36 + van-campenhout-etal-2025-ai + + + Beyond Agreement: Rethinking Ground Truth in Educational <fixed-case>AI</fixed-case> Annotation + Danielle RThomasCarnegie Mellon University + ConradBorchersCarnegie Mellon University + KenKoedingerCarnegie Mellon University + 345-351 + Humans are biased, inconsistent, and yet we keep trusting them to define “ground truth.” This paper questions the overreliance on inter-rater reliability in educational AI and proposes a multidimensional approach leveraging expert-based approaches and close-the-loop validity to build annotations that reflect impact, not just agreement. It’s time we do better. + 2025.aimecon-main.37 + thomas-etal-2025-beyond + + + Automated search algorithm for optimal generalized linear mixed models (<fixed-case>GLMM</fixed-case>s) + MiryeongKooUniversity of Illinois at Urbana-Champaign + JinmingZhangUniversity of Illinois at Urbana-Champaign + 352-358 + Only a limited number of predictors can be included in a generalized linear mixed model (GLMM) due to estimation algorithm divergence. This study aims to propose a machine learning based algorithm (e.g., random forest) that can consider all predictors without the convergence issue and automatically searches for the optimal GLMMs. + 2025.aimecon-main.38 + koo-zhang-2025-automated + + + Exploring the Psychometric Validity of <fixed-case>AI</fixed-case>-Generated Student Responses: A Study on Virtual Personas’ Learning Motivation + HuanxiaoWang + 359-366 + This study explores whether large language models (LLMs) can simulate valid student responses for educational measurement. Using GPT-4o, 2000 virtual student personas were generated. Each persona completed the Academic Motivation Scale (AMS). Factor analyses(EFA and CFA) and clustering showed GPT-4o reproduced the AMS structure and distinct motivational subgroups. + 2025.aimecon-main.39 + wang-2025-exploring + + + Measuring Teaching with <fixed-case>LLM</fixed-case>s + MichaelHardyStanford University + 367-384 + This paper introduces custom Large Language Models using sentence-level embeddings to measure teaching quality. The models achieve human-level performance in analyzing classroom transcripts, outperforming average human rater correlation. Aggregate model scores align with student learning outcomes, establishing a powerful new methodology for scalable teacher feedback. Important limitations discussed. 
+ 2025.aimecon-main.40 + hardy-2025-measuring + + + Simulating Rating Scale Responses with <fixed-case>LLM</fixed-case>s for Early-Stage Item Evaluation + OnurDemirkayaRiverside Insights + Hsin-RoWeiRiverside Insights + EvelynJohnsonRiverside Insights + 385-392 + This study explores the use of large language models to simulate human responses to Likert-scale items. A DeBERTa-base model fine-tuned with item text and examinee ability emulates a graded response model (GRM). High alignment with GRM probabilities and reasonable threshold recovery support LLMs as scalable tools for early-stage item evaluation. + 2025.aimecon-main.41 + demirkaya-etal-2025-simulating + + + Bias and Reliability in <fixed-case>AI</fixed-case> Safety Assessment: Multi-Facet Rasch Analysis of Human Moderators + ChunlingNiuThe University of the Incarnate Word + KellyBradleyUniversity of Kentucky + BiaoMaThe University of the Incarnate Word + BrianWaltmanThe University of the Incarnate Word + LorenCossetteThe University of the Incarnate Word + RuiJinShenzhen University + 393-397 + Using Multi-Facet Rasch Modeling on 36,400 safety ratings of AI-generated conversations, we reveal significant racial disparities (Asian 39.1%, White 28.7% detection rates) and content-specific bias patterns. Simulations show that diverse teams of 8-10 members achieve 70%+ reliability versus 62% for smaller homogeneous teams, providing evidence-based guidelines for AI-generated content moderation. + 2025.aimecon-main.42 + niu-etal-2025-bias + + + Dynamic <fixed-case>B</fixed-case>ayesian Item Response Model with Decomposition (<fixed-case>D</fixed-case>-<fixed-case>BIRD</fixed-case>): Modeling Cohort and Individual Learning Over Time + HansolLeeStanford University + Jason B.ChoCornell University + David S.MattesonCornell University + BenjaminDomingueStanford University + 398-405 + We present D-BIRD, a Bayesian dynamic item response model for estimating student ability from sparse, longitudinal assessments. By decomposing ability into a cohort trend and individual trajectory, D-BIRD supports interpretable modeling of learning over time. We evaluate parameter recovery in simulation and demonstrate the model using real-world personalized learning data. + 2025.aimecon-main.43 + lee-etal-2025-dynamic-bayesian + + + Enhancing Essay Scoring with <fixed-case>GPT</fixed-case>-2 Using Back Translation Techniques + AysegulGunduzUniversity of Alberta + MarkGierlUniversity of Alberta + OkanBulutUniversity of Alberta + 406-416 + This study evaluates GPT-2 (small) for automated essay scoring on the ASAP dataset. Back-translation (English–Turkish–English) improved performance, especially on imbalanced sets. QWK scores peaked at 0.77. Findings highlight augmentation’s value and the need for more advanced, rubric-aware models for fairer assessment. + 2025.aimecon-main.44 + gunduz-etal-2025-enhancing + + + Mathematical Computation and Reasoning Errors by Large Language Models + LiangZhangUniversity of Georgia + EdithGrafETS + 417-424 + We evaluate four LLMs (GPT-4o, o1, DeepSeek-V3, DeepSeek-R1) on purposely challenging arithmetic, algebra, and number-theory items. Coding final answers and step-level solutions correctness reveals performance gaps, improvement paths, and how accurate LLMs can strengthen mathematics assessment and instruction. + 2025.aimecon-main.45 + zhang-graf-2025-mathematical + +
+ + + Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Works in Progress + JoshuaWilson + ChristopherOrmerod + MagdalenBeiting Parrish + National Council on Measurement in Education (NCME) +
Wyndham Grand Pittsburgh Downtown, Pittsburgh, Pennsylvania, United States
+ October + 2025 + 2025.aimecon-wip + aimecon + 979-8-218-84229-1 + + + 2025.aimecon-wip.0 + aime-con-2025-wip + + + Automated Item Neutralization for Non-Cognitive Scales: A Large Language Model Approach to Reducing Social-Desirability Bias + SiruiWuUniversity of British Columbia + DaijinYangNortheastern University + 1-13 + This study explores an AI-assisted approach for rewriting personality scale items to reduce social desirability bias. Using GPT-refined neutralized items based on the IPIP-BFM-50, we compare factor structures, item popularity, and correlations with the MC-SDS to evaluate construct validity and the effectiveness of AI-based item refinement in Chinese contexts. + 2025.aimecon-wip.1 + wu-yang-2025-automated + + + <fixed-case>AI</fixed-case> as a Mind Partner: Cognitive Impact in <fixed-case>P</fixed-case>akistan’s Educational Landscape + EmanKhalid + HammadJavaidLahore University of Management Sciences, LUMS + YashalWaseemLahore University of Management Sciences + Natasha SohailBarlasLahore University of Management Sciences + 14-19 + This study explores how high school and university students in Pakistan perceive and use generative AI as a cognitive extension. Drawing on the Extended Mind Theory, impact on critical thinking, and ethics are evaluated. Findings reveal over-reliance, mixed emotional responses, and institutional uncertainty about AI’s role in learning. + 2025.aimecon-wip.2 + khalid-etal-2025-ai + + + Detecting Math Misconceptions: An <fixed-case>AI</fixed-case> Benchmark Dataset + BethanyRittle-JohnsonVanderbilt University + RebeccaAdlerVanderbilt University + KelleyDurkinVanderbilt University + LBurleighThe Learning Agency + JulesKingThe Learning Agency + ScottCrossleyVanderbilt University + 20-24 + To harness the promise of AI for improving math education, AI models need to be able to diagnose math misconceptions. We created an AI benchmark dataset on math misconceptions and other instructionally-relevant errors, comprising over 52,000 explanations written over 15 math questions that were scored by expert human raters. + 2025.aimecon-wip.3 + rittle-johnson-etal-2025-detecting + + + Optimizing Opportunity: An <fixed-case>AI</fixed-case>-Driven Approach to Redistricting for Fairer School Funding + JordanAbbottNew America, Education Funding Equity Initiative + 25-33 + We address national educational inequity driven by school district boundaries using a comparative AI framework. Our models, which redraw boundaries from scratch or consolidate existing districts, generate evidence-based plans that reduce funding and segregation disparities, offering policymakers scalable, data-driven solutions for systemic reform. + 2025.aimecon-wip.4 + abbott-2025-optimizing + + + Automatic Grading of Student Work Using Simulated Rubric-Based Data and <fixed-case>G</fixed-case>en<fixed-case>AI</fixed-case> Models + YiyaoYangTeachers College, Columbia University + YaseminGulbaharTeachers College, Columbia University + 34-39 + Grading assessment in data science faces challenges related to scalability, consistency, and fairness. Synthetic dataset and GenAI enable us to simulate realistic code samples and automatically evaluate using rubric-driven systems. The research proposes an automatic grading system for generated Python code samples and explores GenAI grading reliability through human-AI comparison. 
+ 2025.aimecon-wip.5 + yang-gulbahar-2025-automatic + + + Cognitive Engagement in <fixed-case>G</fixed-case>en<fixed-case>AI</fixed-case> Tutor Conversations: At-scale Measurement and Impact on Learning + KodiWeatherholtzKhan Academy + Kelli MillwoodHillKhan Academy + KristenDicerboKhan Academy + WaltWellsKhan Academy + PhillipGrimaldiKhan Academy + MayaMiller-VedamKhan Academy + CharlesHoggKhan Academy + BogdanYamkovenkoKhan Academy + 40-48 + We developed and validated a scalable LLM-based labeler for classifying student cognitive engagement in GenAI tutoring conversations. Higher engagement levels predicted improved next-item performance, though further research is needed to assess distal transfer and to disentangle effects of continued tutor use from true learning transfer. + 2025.aimecon-wip.6 + weatherholtz-etal-2025-cognitive + + + Chain-of-Thought Prompting for Automated Evaluation of Revision Patterns in Young Student Writing + TianwenLiUniversity of Pittsburgh + MichelleHongUniversity of Pittsburgh + Lindsay ClareMatsumuraUniversity of Pittsburgh + Elaine LinWangRand Corporation + DianeLitman + ZhexiongLiuUniversity of Pittsburgh + RichardCorrentiUniversity of Pittsburgh + 49-65 + This study explores the use of ChatGPT-4.1 as a formative assessment tool for identifying revision patterns in young adolescents’ argumentative writing. ChatGPT-4.1 shows moderate agreement with human coders on identifying evidence-related revision patterns and fair agreement on explanation-related ones. Implications for LLM-assisted formative assessment of young adolescent writing are discussed. + 2025.aimecon-wip.7 + li-etal-2025-chain-thought + + + Predicting and Evaluating Item Responses Using Machine Learning, Text Embeddings, and <fixed-case>LLM</fixed-case>s + EvelynJohnsonRiverside Insights + Hsin-RoWeiRiverside Insights + TongWu + HuanLiuRiverside Insights + 66-70 + This work-in-progress study compares the accuracy of machine learning and large language models to predict student responses to field-test items on a social-emotional learning assessment. We evaluate how well each method replicates actual responses and compare the item parameters generated from synthetic data to those derived from actual student data. + 2025.aimecon-wip.8 + johnson-etal-2025-predicting + + + Evaluating <fixed-case>LLM</fixed-case>-Based Automated Essay Scoring: Accuracy, Fairness, and Validity + YueHuangMeasurement Incorporated + JoshuaWilsonUniversity of Delaware + 71-83 + This study evaluates large language models (LLMs) for automated essay scoring (AES), comparing prompt strategies and fairness across student groups. We found that well-designed prompting helps LLMs approach traditional AES performance, but both differ from human scores for ELLs—the traditional model shows larger overall gaps, while LLMs show subtler disparities. + 2025.aimecon-wip.9 + huang-wilson-2025-evaluating + + + Comparing <fixed-case>AI</fixed-case> tools and Human Raters in Predicting Reading Item Difficulty + HongliLiGeorgia State University + RoulaAldibGeorgia State University + ChadMarchongGeorgia State University + KevinFanFulton County Schools + 84-89 + This study compares AI tools and human raters in predicting the difficulty of reading comprehension items without response data. Predictions from AI models (ChatGPT, Gemini, Claude, and DeepSeek) and human raters are evaluated against empirical difficulty values derived from student responses. Findings will inform AI’s potential to support test development.
+ 2025.aimecon-wip.10 + li-etal-2025-comparing + + + When Machines Mislead: Human Review of Erroneous <fixed-case>AI</fixed-case> Cheating Signals + WilliamBelzakDuolingo + ChenhaoNiuDuolingo, Inc. + AngelOrtmann LeeDuolingo, Inc. + 90-97 + This study examines how human proctors interpret AI-generated alerts for misconduct in remote assessments. Findings suggest proctors can identify false positives, though confirmation bias and differences across test-taker nationalities were observed. Results highlight opportunities to refine proctoring guidelines and strengthen fairness in human oversight of automated signals in high-stakes testing. + 2025.aimecon-wip.11 + belzak-etal-2025-machines + + + Fairness in Formative <fixed-case>AI</fixed-case>: Cognitive Complexity in Chatbot Questions Across Research Topics + Alexandra BarryColbertCollege Board + Karen DWangSchool of Information, San Jose State University + 98-106 + This study evaluates whether questions generated from a socratic-style research AI chatbot designed to support project-based AP courses maintains cognitive complexity parity when inputted with research topics of controversial and non-controversial nature. We present empirical findings indicating no significant conversational complexity differences, highlighting implications for equitable AI use in formative assessment. + 2025.aimecon-wip.12 + colbert-wang-2025-fairness + + + Keystroke Analysis in Digital Test Security: <fixed-case>AI</fixed-case> Approaches for Copy-Typing Detection and Cheating Ring Identification + ChenhaoNiuDuolingo, Inc. + Yong-SiangShihDuolingo, Inc. + ManqianLiaoDuolingo + RuidongLiuDuolingo, Inc. + AngelOrtmann LeeDuolingo, Inc. + 107-116 + This project leverages AI-based analysis of keystroke and mouse data to detect copy-typing and identify cheating rings in the Duolingo English Test. By modeling behavioral biometrics, the approach provides actionable signals to proctors, enhancing digital test security for large-scale online assessment. + 2025.aimecon-wip.13 + niu-etal-2025-keystroke + + + Talking to Learn: A <fixed-case>S</fixed-case>o<fixed-case>TL</fixed-case> Study of Generative <fixed-case>AI</fixed-case>-Facilitated Feynman Reviews + Madeline RoseMattoxUniversity of Virginia + NatalieHutchinsUniversity of Virginia + Jamie JJiroutUniversity of Virginia + 117-124 + Structured Generative AI interactions have potential for scaffolding learning. This Scholarship of Teaching and Learning study analyzes 16 undergraduate students’ Feynman-style AI interactions (N=157) across a semester-long child-development course. Qualitative coding of the interactions explores engagement patterns, metacognitive support, and response consistency, informing ethical AI integration in higher education. + 2025.aimecon-wip.14 + mattox-etal-2025-talking + + + <fixed-case>AI</fixed-case>-Powered Coding of Elementary Students’ Small-Group Discussions about Text + CarlaFirettoArizona State University + P. KarenMurphyThe Pennsylvania State University + LinYanArizona State University + YueTangThe Pennsylvania State University + 125-134 + We report reliability and validity evidence for an AI-powered coding of 371 small-group discussion transcripts. Evidence via comparability and ground truth checks suggested high consistency between AI-produced and human-produced codes. Research in progress is also investigating reliability and validity of a new “quality” indicator to complement the current coding. 
+ 2025.aimecon-wip.15 + firetto-etal-2025-ai + + + Evaluating the Reliability of Human–<fixed-case>AI</fixed-case> Collaborative Scoring of Written Arguments Using Rational Force Model + NorikoTakahashiM.S. in Computational Linguistics, Montclair State University + AbrahamOnuorahPhD in Teacher Education and Teacher Development, Montclair State University + AlinaReznitskayaMontclair State University + EvgenyChukharevIowa State University + ArielSykesMontclair State University + MicheleFlammiaIndependent researcher + JoeOylerMaynooth University + 135-140 + This study aims to improve the reliability of a new AI collaborative scoring system used to assess the quality of students’ written arguments. The system draws on the Rational Force Model and focuses on classifying the functional relation of each proposition in terms of support, opposition, acceptability, and relevance. + 2025.aimecon-wip.16 + takahashi-etal-2025-evaluating + + + Evaluating Deep Learning and Transformer Models on <fixed-case>SME</fixed-case> and <fixed-case>G</fixed-case>en<fixed-case>AI</fixed-case> Items + JoeBettsNational Council of State Boards of Nursing + WilliamMunteanNational Council of State Boards of Nursing + 141-146 + This study leverages deep learning, transformer models, and generative AI to streamline test development by automating metadata tagging and item generation. Transformer models outperform simpler approaches, reducing SME workload. Ongoing research refines complex models and evaluates LLM-generated items, enhancing efficiency in test creation. + 2025.aimecon-wip.17 + betts-muntean-2025-evaluating + + + Comparison of <fixed-case>AI</fixed-case> and Human Scoring on A Visual Arts Assessment + NingJiangMeasurement Incorporated + YueHuangMeasurement Incorporated + JieChenMeasurement Incorporated + 147-154 + This study examines reliability and comparability of Generative AI scores versus human ratings on two performance tasks—text-based and drawing-based—in a fourth-grade visual arts assessment. Results show GPT-4 is consistent, aligned with humans but more lenient, and its agreement with humans is slightly lower than that between human raters. + 2025.aimecon-wip.18 + jiang-etal-2025-comparison + + + Explainable Writing Scores via Fine-grained, <fixed-case>LLM</fixed-case>-Generated Features + James VBrunoPearson + LeeBeckerPearson + 155-165 + Advancements in deep learning have enhanced Automated Essay Scoring (AES) accuracy but reduced interpretability. This paper investigates using LLM-generated features to train an explainable scoring model. By framing feature engineering as prompt engineering, state-of-the-art language technology can be integrated into simpler, more interpretable AES models. + 2025.aimecon-wip.19 + bruno-becker-2025-explainable + + + Validating Generative <fixed-case>AI</fixed-case> Scoring of Constructed Responses with Cognitive Diagnosis + HyunjooKimUniversity of Illinois Urbana-Champaign + 166-177 + This research explores the feasibility of applying the cognitive diagnosis assessment (CDA) framework to validate generative AI-based scoring of constructed responses (CRs). The classification information of CRs and item-parameter estimates from cognitive diagnosis models (CDMs) could provide additional validity evidence for AI-generated CR scores and feedback. 
+ 2025.aimecon-wip.20 + kim-2025-validating + + + Automated Diagnosis of Students’ Number Line Strategies for Fractions + ZhizhiWangRutgers University + DakeZhangRutgers University + MinLiUniversity of Washington + YuhanTaoColumbia University + 178-184 + This study aims to develop and evaluate an AI-based platform that automatically grades and classifies problem-solving strategies and error types in students’ handwritten fraction representations involving number lines. The model development procedures and preliminary evaluation results comparing the model with available LLMs and human expert annotations are reported. + 2025.aimecon-wip.21 + wang-etal-2025-automated + + + Medical Item Difficulty Prediction Using Machine Learning + Hope OluwaseunAdegokeUniversity of North Carolina, Greensboro + YingDuAmerican Board of Pediatrics + AndrewDwyerAmerican Board of Pediatrics + 185-190 + This project aims to use machine learning models to predict medical exam item difficulty by combining item metadata, linguistic features, word embeddings, and semantic similarity measures with a sample size of 1000 items. The goal is to improve the accuracy of difficulty prediction in medical assessment. + 2025.aimecon-wip.22 + adegoke-etal-2025-medical + + + Examining decoding items using engine transcriptions and scoring in early literacy assessment + ZacharySchultzCambium Learning Group, Inc. + MackenzieYoung + DebbieDugdaleCambium Assessment, Inc. + SusanLottridgeCambium Assessment + 191-196 + We investigate the reliability of two scoring approaches to early literacy decoding items, whereby students are shown a word and asked to say it aloud. Approaches were rubric scoring of speech and human or AI transcription with varying explicit scoring rules. Initial results suggest rubric-based approaches perform better than transcription-based methods. + 2025.aimecon-wip.23 + schultz-etal-2025-examining + + + Addressing Few-Shot <fixed-case>LLM</fixed-case> Classification Instability Through Explanation-Augmented Distillation + WilliamMunteanNational Council of State Boards of Nursing + JoeBettsNational Council of State Boards of Nursing + 197-203 + This study compares explanation-augmented knowledge distillation with few-shot in-context learning for LLM-based exam question classification. Fine-tuned smaller language models achieved competitive performance with greater consistency than large-model few-shot approaches, which exhibited notable variability across different examples. Hyperparameter selection proved essential, with extremely low learning rates significantly impairing model performance. + 2025.aimecon-wip.24 + muntean-betts-2025-addressing + + + Identifying Biases in Large Language Model Assessment of Linguistically Diverse Texts + Lionel HsienMengUniversity of Wisconsin - Madison + ShamyaKarumbaiahUniversity of Wisconsin - Madison + VivekSaravananUniversity of Wisconsin - Madison + DanielBoltUniversity of Wisconsin - Madison + 204-210 + The development of Large Language Models (LLMs) to assess student text responses is rapidly progressing, but evaluating whether LLMs equitably assess multilingual learner responses is an important precursor to adoption. Our study provides an example procedure for identifying and quantifying bias in LLM assessment of student essay responses.
+ 2025.aimecon-wip.25 + meng-etal-2025-identifying + + + Implicit Biases in Large Vision–Language Models in Classroom Contexts + PeterBaldwinNational Board of Medical Examiners + 211-217 + Using a counterfactual, adversarial, audit-style approach, we tested whether ChatGPT-4o evaluates classroom lectures differently based on teacher demographics. The model was told only to rate lecture excerpts embedded within classroom images—without reference to the images themselves. Despite this, ratings varied systematically by teacher race and sex, revealing implicit bias. + 2025.aimecon-wip.26 + baldwin-2025-implicit + + + Enhancing Item Difficulty Prediction in Large-scale Assessment with Large Language Model + MubarakMojoyinola + Olasunkanmi JamesKehindeNorfolk State University + JudyTangWestat + 218-222 + Field testing is a resource-intensive bottleneck in test development. This study applied an interpretable framework that leverages a Large Language Model (LLM) for structured feature extraction from TIMSS items. These features will train several classifiers, whose predictions will be explained using SHAP, providing actionable, diagnostic insights for item writers. + 2025.aimecon-wip.27 + mojoyinola-etal-2025-enhancing + + + Leveraging <fixed-case>LLM</fixed-case>s for Cognitive Skill Mapping in <fixed-case>TIMSS</fixed-case> Mathematics Assessment + Ruchi JSachdevaPearson + Jung YeonParkGeorge Mason University + 223-228 + This study evaluates ChatGPT-4’s potential to support validation of Q-matrices and analysis of complex skill–item interactions. By comparing its outputs to expert benchmarks, we assess accuracy, consistency, and limitations, offering insights into how large language models can augment expert judgment in diagnostic assessment and cognitive skill mapping. + 2025.aimecon-wip.28 + sachdeva-park-2025-leveraging +
+ + + Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Coordinated Session Papers + JoshuaWilson + ChristopherOrmerod + MagdalenBeiting Parrish + National Council on Measurement in Education (NCME) +
Wyndham Grand Pittsburgh Downtown, Pittsburgh, Pennsylvania, United States
+ October + 2025 + 2025.aimecon-sessions + aimecon + 979-8-218-84230-7 + + + 2025.aimecon-sessions.0 + aime-con-2025-sessions + + + When Does Active Learning Actually Help? Empirical Insights with Transformer-based Automated Scoring + Justin OBarberPearson + Michael P.HemenwayPearson + EdwardWolfePearson + 1-8 + Developing automated essay scoring (AES) systems typically demands extensive human annotation, incurring significant costs and requiring considerable time. Active learning (AL) methods aim to alleviate this challenge by strategically selecting the most informative essays for scoring, thereby potentially reducing annotation requirements without compromising model accuracy. This study systematically evaluates four prominent AL strategies—uncertainty sampling, BatchBALD, BADGE, and a novel GenAI-based uncertainty approach—against a random sampling baseline, using DeBERTa-based regression models across multiple assessment prompts exhibiting varying degrees of human scorer agreement. Contrary to initial expectations, we found that AL methods provided modest but meaningful improvements only for prompts characterized by poor scorer reliability (<60% agreement per score point). Notably, extensive hyperparameter optimization alone substantially reduced the annotation budget required to achieve near-optimal scoring performance, even with random sampling. Our findings underscore that while targeted AL methods can be beneficial in contexts of low scorer reliability, rigorous hyperparameter tuning remains a foundational and highly effective strategy for minimizing annotation costs in AES system development. + 2025.aimecon-sessions.1 + barber-etal-2025-active + + + Automated Essay Scoring Incorporating Annotations from Automated Feedback Systems + ChristopherOrmerodCambium Assessment + 9-18 + This study illustrates how incorporating feedback-oriented annotations into the scoring pipeline can enhance the accuracy of automated essay scoring (AES). This approach is demonstrated with the Persuasive Essays for Rating, Selecting, and Understanding Argumentative and Discourse Elements (PERSUADE) corpus. We integrate two types of feedback-driven annotations: those that identify spelling and grammatical errors, and those that highlight argumentative components. To illustrate how this method could be applied in real-world scenarios, we employ two LLMs to generate annotations – a generative language model used for spell correction and an encoder-based token-classifier trained to identify and mark argumentative elements. By incorporating annotations into the scoring process, we demonstrate improvements in performance using encoder-based large language models fine-tuned as classifiers. + 2025.aimecon-sessions.2 + ormerod-2025-automated + + + Text-Based Approaches to Item Alignment to Content Standards in Large-Scale Reading & Writing Tests + YanbinFuUniversity of Maryland, College Park + HongJiaoUniversity of Maryland + TianyiZhouUniversity of Maryland + NanZhangUniversity of Maryland + MingLiUniversity of Maryland + QingshuXuUniversity of Maryland, College Park + SydneyPetersUniversity of Maryland, College Park + Robert WLissitzUniversity of Maryland, College Park + 19-36 + Aligning test items to content standards is a critical step in test development to collect validity evidence based on content. Item alignment has typically been conducted by human experts, but this judgmental process can be subjective and time-consuming.
This study investigated the performance of fine-tuned small language models (SLMs) for automated item alignment using data from a large-scale standardized reading and writing test for college admissions. Different SLMs were trained for both domain and skill alignment. The model performance was evaluated using precision, recall, accuracy, weighted F1 score, and Cohen’s kappa on two test sets. The impact of input data types and training sample sizes was also explored. Results showed that including more textual inputs led to better performance gains than increasing sample size. For comparison, classic supervised machine learning classifiers were trained on multilingual-E5 embedding. Fine-tuned SLMs consistently outperformed these models, particularly for fine-grained skill alignment. To better understand model classifications, semantic similarity analyses including cosine similarity, Kullback-Leibler divergence of embedding distributions, and two-dimension projections of item embedding revealed that certain skills in the two test datasets were semantically too close, providing evidence for the observed misclassification patterns. + 2025.aimecon-sessions.3 + fu-etal-2025-text + + + Review of Text-Based Approaches to Item Difficulty Modeling in Large-Scale Assessments + SydneyPetersUniversity of Maryland, College Park + NanZhangUniversity of Maryland + HongJiaoUniversity of Maryland + MingLiUniversity of Maryland + TianyiZhouUniversity of Maryland + 37-47 + Item difficulty plays a crucial role in evaluating item quality, test form assembly, and interpretation of scores in large-scale assessments. Traditional approaches to estimate item difficulty rely on item response data collected in field testing, which can be time-consuming and costly. To overcome these challenges, text-based approaches leveraging machine learning and natural language processing have emerged as promising alternatives. This paper reviews and synthesizes 37 articles on automated item difficulty prediction in large-scale assessments. Each study is synthesized in terms of the dataset, difficulty parameter, subject domain, item type, number of items, training and test data split, input, features, model, evaluation criteria, and model performance outcomes. Overall, text-based models achieved moderate to high predictive performance, highlighting the potential of text-based item difficulty modeling to enhance the current practices of item quality evaluation. + 2025.aimecon-sessions.4 + peters-etal-2025-review + + + Item Difficulty Modeling Using Fine-Tuned Small and Large Language Models + MingLiUniversity of Maryland + HongJiaoUniversity of Maryland + TianyiZhouUniversity of Maryland + NanZhangUniversity of Maryland + SydneyPetersUniversity of Maryland, College Park + Robert WLissitzUniversity of Maryland, College Park + 48-55 + This study investigates methods for item difficulty modeling in large-scale assessments using both small and large language models. We introduce novel data augmentation strategies, including on-the-fly augmentation and distribution balancing, that surpass benchmark performances, demonstrating their effectiveness in mitigating data imbalance and improving model performance. Our results showed that fine-tuned small language models such as BERT and RoBERTa yielded lower root mean squared error than the first-place winning model in the BEA 2024 Shared Task competition, whereas domain-specific models like BioClinicalBERT and PubMedBERT did not provide significant improvements due to distributional gaps. 
Majority voting among small language models enhanced prediction accuracy, reinforcing the benefits of ensemble learning. Large language models (LLMs), such as GPT-4, exhibited strong generalization capabilities but struggled with item difficulty prediction, likely due to limited training data and the absence of explicit difficulty-related context. Chain-of-thought prompting and rationale generation approaches were explored but did not yield substantial improvements, suggesting that additional training data or more sophisticated reasoning techniques may be necessary. Embedding-based methods, particularly using NV-Embed-v2, showed promise but did not outperform our best augmentation strategies, indicating that capturing nuanced difficulty-related features remains a challenge. + 2025.aimecon-sessions.5 + li-etal-2025-item + + + Operational Alignment of Confidence-Based Flagging Methods in Automated Scoring + CoreyPalermoMeasurement Incorporated + TroyChenMeasurement Incorporated + AriantoWibowoMeasurement Incorporated + 56-60 + 2025.aimecon-sessions.6 + palermo-etal-2025-operational + + + Pre-Pilot Optimization of Conversation-Based Assessment Items Using Synthetic Response Data + TylerBurleighKhan Academy + JingChenKhan Academy + KristenDicerboKhan Academy + 61-68 + Correct answers to math problems don’t reveal if students understand concepts or just memorized procedures. Conversation-Based Assessment (CBA) addresses this through AI dialogue, but reliable scoring requires costly pilots and specialized expertise. Our Criteria Development Platform (CDP) enables pre-pilot optimization using synthetic data, reducing development from months to days. Testing 17 math items through 68 iterations, all achieved our reliability threshold (MCC ≥ 0.80) after refinement – up from 59% initially. Without refinement, 7 items would have remained below this threshold. By making reliability validation accessible, CDP empowers educators to develop assessments meeting automated scoring standards. + 2025.aimecon-sessions.7 + burleigh-etal-2025-pre + + + When Humans Can’t Agree, Neither Can Machines: The Promise and Pitfalls of <fixed-case>LLM</fixed-case>s for Formative Literacy Assessment + OwenHenkelUniversity of Oxford + KirkVanacoreCornell University + BillRobertsLegible Labs + 69-78 + Story retell assessments provide valuable insights into reading comprehension but face implementation barriers due to time-intensive administration and scoring.
This study examines whether Large Language Models (LLMs) can reliably replicate human judgment in grading story retells. Using a novel dataset, we conduct three complementary studies examining LLM performance across different rubric systems, agreement patterns, and reasoning alignment. We find that LLMs (a) achieve near-human reliability with appropriate rubric design, (b) perform well on easy-to-grade cases but poorly on ambiguous ones, (c) produce explanations for their grades that are plausible for straightforward cases but unreliable for complex ones, and (d) different LLMs display consistent “grading personalities” (systematically scoring harder or easier across all student responses). These findings support hybrid assessment architectures where AI handles routine scoring, enabling more frequent formative assessment while directing teacher expertise toward students requiring nuanced support. + 2025.aimecon-sessions.8 + henkel-etal-2025-humans + + + Beyond the Hint: Using Self-Critique to Constrain <fixed-case>LLM</fixed-case> Feedback in Conversation-Based Assessment + TylerBurleighKhan Academy + JennyHanKhan Academy + KristenDicerboKhan Academy + 79-85 + Large Language Models in Conversation-Based Assessment tend to provide inappropriate hints that compromise validity. We demonstrate that self-critique – a simple prompt engineering technique – effectively constrains this behavior. Through two studies using synthetic conversations and real-world high school math pilot data, self-critique reduced inappropriate hints by 90.7% and 24-75% respectively. Human experts validated ground truth labels while LLM judges enabled scale. This immediately deployable solution addresses the critical tension in intermediate-stakes assessment: maintaining student engagement while ensuring fair comparisons. Our findings show prompt engineering can meaningfully safeguard assessment integrity without model fine-tuning. + 2025.aimecon-sessions.9 + burleigh-etal-2025-beyond + + + Investigating Adversarial Robustness in <fixed-case>LLM</fixed-case>-based <fixed-case>AES</fixed-case> + RenjithRavindranETS + IkkyuChoiETS + 86-91 + Automated Essay Scoring (AES) is one of the most widely studied applications of Natural Language Processing (NLP) in education and educational measurement. Recent advances with pre-trained Transformer-based large language models (LLMs) have shifted AES from feature-based modeling to leveraging contextualized language representations. These models provide rich semantic representations that substantially improve scoring accuracy and human–machine agreement compared to systems relying on handcrafted features. However, their robustness towards adversarially crafted inputs remains poorly understood. In this study, we define adversarial input as any modification of the essay text designed to fool an automated scoring system into assigning an inflated score. We evaluate a fine-tuned DeBERTa-based AES model on such inputs and show that it is highly susceptible to a simple text duplication attack, highlighting the need to consider adversarial robustness alongside accuracy in the development of AES systems.
+ 2025.aimecon-sessions.10 + ravindran-choi-2025-investigating + + + Effects of Generation Model on Detecting <fixed-case>AI</fixed-case>-generated Essays in a Writing Test + JiyunZuETS + MichaelFaussETS + ChenLiETS + 92-98 + Various detectors have been developed to detect AI-generated essays using labeled datasets of human-written and AI-generated essays, with many reporting high detection accuracy. In real-world settings, essays may be generated by models different from those used to train the detectors. This study examined the effects of generation model on detector performance. We focused on two generation models – GPT-3.5 and GPT-4 – and used writing items from a standardized English proficiency test. Eight detectors were built and evaluated. Six were trained on three training sets (human-written essays combined with either GPT-3.5-generated essays, or GPT-4-generated essays, or both) using two training approaches (feature-based machine learning and fine-tuning RoBERTa), and the remaining two were ensemble detectors. Results showed that a) fine-tuned detectors outperformed feature-based machine learning detectors on all studied metrics; b) detectors trained with essays generated from only one model were more likely to misclassify essays generated by the other model as human-written essays (false negatives), but did not misclassify more human-written essays as AI-generated (false positives); c) the ensemble fine-tuned RoBERTa detector had fewer false positives, but slightly more false negatives than detectors trained with essays generated by both models. + 2025.aimecon-sessions.11 + zu-etal-2025-effects + + + Exploring the Interpretability of <fixed-case>AI</fixed-case>-Generated Response Detection with Probing + IkkyuChoiETS + JiyunZuETS + 99-106 + Multiple strategies for AI-generated response detection have been proposed, with many high-performing ones built on language models. However, the decision-making processes of these detectors remain largely opaque. We addressed this knowledge gap by fine-tuning a language model for the detection task and applying probing techniques using adversarial examples. Our adversarial probing analysis revealed that the fine-tuned model relied heavily on a narrow set of lexical cues in making the classification decision. These findings underscore the importance of interpretability in AI-generated response detectors and highlight the value of adversarial probing as a tool for exploring model interpretability. + 2025.aimecon-sessions.12 + choi-zu-2025-exploring + + + A Fairness-Promoting Detection Objective With Applications in <fixed-case>AI</fixed-case>-Assisted Test Security + MichaelFaussETS + IkkyuChoiETS + 107-114 + A detection objective based on bounded group-wise false alarm rates is proposed to promote fairness in the context of test fraud detection. The paper begins by outlining key aspects and characteristics that distinguish fairness in test security from fairness in other domains and machine learning in general. The proposed detection objective is then introduced, the corresponding optimal detection policy is derived, and the implications of the results are examined in light of the earlier discussion. A numerical example using synthetic data illustrates the proposed detector and compares its properties to those of a standard likelihood ratio test. 
+ 2025.aimecon-sessions.13 + fauss-choi-2025-fairness + + + The Impact of an <fixed-case>NLP</fixed-case>-Based Writing Tool on Student Writing + KarthikSairamCambium Assessment + AmyBurkhardtCambium Assessment + SusanLottridgeCambium Assessment + 115-123 + We present preliminary evidence on the impact of an NLP-based writing feedback tool, Write-On with Cambi!, on students’ argumentative writing. Students were randomly assigned to receive access to the tool or not, and their essay scores were compared across three rubric dimensions; estimated effect sizes (Cohen’s d) ranged from 0.25 to 0.26 (with notable variation in the average treatment effect across classrooms). To characterize and compare the groups’ writing processes, we implemented an algorithm that classified each revision as Appended (new text added to the end), Surface-level (minor within-text corrections to conventions), or Substantive (larger within-text changes or additions). We interpret within-text edits (Surface-level or Substantive) as potential markers of metacognitive engagement in revision, and note that these within-text edits are more common in students who had access to the tool. Together, these pilot analyses serve as a first step in testing the tool’s theory of action. + 2025.aimecon-sessions.14 + sairam-etal-2025-impact + +
+
diff --git a/data/yaml/venues/aimecon.yaml b/data/yaml/venues/aimecon.yaml new file mode 100644 index 0000000000..7a942ed59f --- /dev/null +++ b/data/yaml/venues/aimecon.yaml @@ -0,0 +1,2 @@ +acronym: AIME-Con +name: Artificial Intelligence in Measurement and Education Conference (AIME-Con)