Merge pull request #8 from run-llama/clelia/larger-scale-eval

chore: larger scale evals
This commit is contained in:
Clelia (Astra) Bertelli
2026-01-12 14:58:51 +01:00
committed by GitHub
28 changed files with 2070 additions and 51 deletions
+5 -1
View File
@@ -15,4 +15,8 @@ tmp/
qdrant_storage/
# logs
fs-explorer.log
fs-explorer.log
rag-starterkit.log
# benchmark with many more data
texts/
+3
View File
@@ -35,3 +35,6 @@ In the [`packages`](./packages/) directory we have two packages:
- [**rag-starterkit**](./packages/rag-starterkit/): a traditional RAG application implementing hybrid search. (More in the [dedicated README](./packages/rag-starterkit/README.md)).
- [**eval-framework**](./packages/eval-framework/): a simple evaluation framework that produces LLM-as-a-judge-based evals along with collecting time-, tool usage- and file-search-based statistics. Find out how to run the evaluation and collect statistics in the [dedicated README](./packages/eval-framework/README.md).
- [**cache-arxiv**](./packages/cache-arxiv/): a utility package to cache arXiv papers for larger scale experiments.
Find the evaluation guide [here](./packages/eval-framework/README.md).
@@ -0,0 +1,100 @@
{"id":"http://arxiv.org/abs/2601.06022v1","updated":"2026-01-09T18:58:22Z","published":"2026-01-09T18:58:22Z","title":"AdaFuse: Adaptive Ensemble Decoding with Test-Time Scaling for LLMs","authors":["Chengming Cui","Tianxin Wei","Ziyi Chen","Ruizhong Qiu","Zhichen Zeng","Zhining Liu","Xuying Ning","Duo Zhou","Jingrui He"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.06022v1.pdf","html_url":"https://arxiv.org/abs/2601.06022v1","comment":null}
{"id":"http://arxiv.org/abs/2601.06002v1","updated":"2026-01-09T18:39:01Z","published":"2026-01-09T18:39:01Z","title":"The Molecular Structure of Thought: Mapping the Topology of Long Chain-of-Thought Reasoning","authors":["Qiguang Chen","Yantao Du","Ziniu Li","Jinhao Liu","Songyao Duan","Jiarui Guo","Minghao Liu","Jiaheng Liu","Tong Yang","Ge Zhang","Libo Qin","Wanxiang Che","Wenhao Huang"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.06002v1.pdf","html_url":"https://arxiv.org/abs/2601.06002v1","comment":"Preprint"}
{"id":"http://arxiv.org/abs/2601.05991v1","updated":"2026-01-09T18:17:11Z","published":"2026-01-09T18:17:11Z","title":"Open-Vocabulary 3D Instruction Ambiguity Detection","authors":["Jiayu Ding","Haoran Tang","Ge Li"],"primary_category":"cs.AI","categories":["cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05991v1.pdf","html_url":"https://arxiv.org/abs/2601.05991v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05966v1","updated":"2026-01-09T17:34:59Z","published":"2026-01-09T17:34:59Z","title":"VideoAR: Autoregressive Video Generation via Next-Frame & Scale Prediction","authors":["Longbin Ji","Xiaoxiong Liu","Junyuan Shang","Shuohuan Wang","Yu Sun","Hua Wu","Haifeng Wang"],"primary_category":"cs.CV","categories":["cs.CV","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05966v1.pdf","html_url":"https://arxiv.org/abs/2601.05966v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05937v1","updated":"2026-01-09T16:48:50Z","published":"2026-01-09T16:48:50Z","title":"Performance of a Deep Learning-Based Segmentation Model for Pancreatic Tumors on Public Endoscopic Ultrasound Datasets","authors":["Pankaj Gupta","Priya Mudgil","Niharika Dutta","Kartik Bose","Nitish Kumar","Anupam Kumar","Jimil Shah","Vaneet Jearth","Jayanta Samanta","Vishal Sharma","Harshal Mandavdhare","Surinder Rana","Saroj K Sinha","Usha Dutta"],"primary_category":"cs.CV","categories":["cs.CV","cs.AI","cs.LG"],"pdf_url":"https://arxiv.org/pdf/2601.05937v1.pdf","html_url":"https://arxiv.org/abs/2601.05937v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05930v1","updated":"2026-01-09T16:44:17Z","published":"2026-01-09T16:44:17Z","title":"Can We Predict Before Executing Machine Learning Agents?","authors":["Jingsheng Zheng","Jintian Zhang","Yujie Luo","Yuren Mao","Yunjun Gao","Lun Du","Huajun Chen","Ningyu Zhang"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI","cs.LG","cs.MA"],"pdf_url":"https://arxiv.org/pdf/2601.05930v1.pdf","html_url":"https://arxiv.org/abs/2601.05930v1","comment":"Work in progress"}
{"id":"http://arxiv.org/abs/2601.05923v1","updated":"2026-01-09T16:37:48Z","published":"2026-01-09T16:37:48Z","title":"Cedalion Tutorial: A Python-based framework for comprehensive analysis of multimodal fNIRS & DOT from the lab to the everyday world","authors":["E. Middell","L. Carlton","S. Moradi","T. Codina","T. Fischer","J. Cutler","S. Kelley","J. Behrendt","T. Dissanayake","N. Harmening","M. A. Yücel","D. A. Boas","A. von Lühmann"],"primary_category":"eess.SP","categories":["eess.SP","cs.AI","cs.LG","eess.IV","q-bio.QM"],"pdf_url":"https://arxiv.org/pdf/2601.05923v1.pdf","html_url":"https://arxiv.org/abs/2601.05923v1","comment":"33 pages main manuscript, 180 pages Supplementary Tutorial Notebooks, 12 figures, 6 tables, under review in SPIE Neurophotonics"}
{"id":"http://arxiv.org/abs/2601.05918v1","updated":"2026-01-09T16:32:33Z","published":"2026-01-09T16:32:33Z","title":"Agentic LLMs as Powerful Deanonymizers: Re-identification of Participants in the Anthropic Interviewer Dataset","authors":["Tianshi Li"],"primary_category":"cs.CR","categories":["cs.CR","cs.AI","cs.CY"],"pdf_url":"https://arxiv.org/pdf/2601.05918v1.pdf","html_url":"https://arxiv.org/abs/2601.05918v1","comment":"4 pages"}
{"id":"http://arxiv.org/abs/2601.05909v1","updated":"2026-01-09T16:28:11Z","published":"2026-01-09T16:28:11Z","title":"Auditing Fairness under Model Updates: Fundamental Complexity and Property-Preserving Updates","authors":["Ayoub Ajarra","Debabrota Basu"],"primary_category":"cs.LG","categories":["cs.LG","cs.AI","cs.CY","stat.ML"],"pdf_url":"https://arxiv.org/pdf/2601.05909v1.pdf","html_url":"https://arxiv.org/abs/2601.05909v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05905v1","updated":"2026-01-09T16:23:21Z","published":"2026-01-09T16:23:21Z","title":"Illusions of Confidence? Diagnosing LLM Truthfulness via Neighborhood Consistency","authors":["Haoming Xu","Ningyuan Zhao","Yunzhi Yao","Weihong Xu","Hongru Wang","Xinle Deng","Shumin Deng","Jeff Z. Pan","Huajun Chen","Ningyu Zhang"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI","cs.HC","cs.LG","cs.MA"],"pdf_url":"https://arxiv.org/pdf/2601.05905v1.pdf","html_url":"https://arxiv.org/abs/2601.05905v1","comment":"Work in progress"}
{"id":"http://arxiv.org/abs/2601.05904v1","updated":"2026-01-09T16:22:26Z","published":"2026-01-09T16:22:26Z","title":"Can AI mediation improve democratic deliberation?","authors":["Michael Henry Tessler","Georgina Evans","Michiel A. Bakker","Iason Gabriel","Sophie Bridgers","Rishub Jain","Raphael Koster","Verena Rieser","Anca Dragan","Matthew Botvinick","Christopher Summerfield"],"primary_category":"cs.CY","categories":["cs.CY","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05904v1.pdf","html_url":"https://arxiv.org/abs/2601.05904v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05899v1","updated":"2026-01-09T16:18:08Z","published":"2026-01-09T16:18:08Z","title":"TowerMind: A Tower Defence Game Learning Environment and Benchmark for LLM as Agents","authors":["Dawei Wang","Chengming Zhou","Di Zhao","Xinyuan Liu","Marci Chi Ma","Gary Ushaw","Richard Davison"],"primary_category":"cs.AI","categories":["cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05899v1.pdf","html_url":"https://arxiv.org/abs/2601.05899v1","comment":"AAAI 2026 Oral"}
{"id":"http://arxiv.org/abs/2601.05890v1","updated":"2026-01-09T16:09:48Z","published":"2026-01-09T16:09:48Z","title":"StackPlanner: A Centralized Hierarchical Multi-Agent System with Task-Experience Memory Management","authors":["Ruizhe Zhang","Xinke Jiang","Zhibang Yang","Zhixin Zhang","Jiaran Gao","Yuzhen Xiao","Hongbin Lai","Xu Chu","Junfeng Zhao","Yasha Wang"],"primary_category":"cs.AI","categories":["cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05890v1.pdf","html_url":"https://arxiv.org/abs/2601.05890v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05882v1","updated":"2026-01-09T15:56:55Z","published":"2026-01-09T15:56:55Z","title":"An Empirical Study on Preference Tuning Generalization and Diversity Under Domain Shift","authors":["Constantinos Karouzos","Xingwei Tan","Nikolaos Aletras"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI","cs.LG"],"pdf_url":"https://arxiv.org/pdf/2601.05882v1.pdf","html_url":"https://arxiv.org/abs/2601.05882v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05879v1","updated":"2026-01-09T15:55:03Z","published":"2026-01-09T15:55:03Z","title":"Gender Bias in LLMs: Preliminary Evidence from Shared Parenting Scenario in Czech Family Law","authors":["Jakub Harasta","Matej Vasina","Martin Kornel","Tomas Foltynek"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI","cs.CY"],"pdf_url":"https://arxiv.org/pdf/2601.05879v1.pdf","html_url":"https://arxiv.org/abs/2601.05879v1","comment":"Accepted at AI for Access to Justice, Dispute Resolution, and Data Access (AIDA2J) at Jurix 2025, Torino, Italy"}
{"id":"http://arxiv.org/abs/2601.05874v1","updated":"2026-01-09T15:51:12Z","published":"2026-01-09T15:51:12Z","title":"Continual-learning for Modelling Low-Resource Languages from Large Language Models","authors":["Santosh Srinath K","Mudit Somani","Varun Reddy Padala","Prajna Devi Upadhyay","Abhijit Das"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05874v1.pdf","html_url":"https://arxiv.org/abs/2601.05874v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05870v1","updated":"2026-01-09T15:46:40Z","published":"2026-01-09T15:46:40Z","title":"IIB-LPO: Latent Policy Optimization via Iterative Information Bottleneck","authors":["Huilin Deng","Hongchen Luo","Yue Zhu","Long Li","Zhuoyue Chen","Xinghao Zhao","Ming Li","Jihai Zhang","Mengchang Wang","Yang Cao","Yu Kang"],"primary_category":"cs.LG","categories":["cs.LG","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05870v1.pdf","html_url":"https://arxiv.org/abs/2601.05870v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05858v1","updated":"2026-01-09T15:34:31Z","published":"2026-01-09T15:34:31Z","title":"CLewR: Curriculum Learning with Restarts for Machine Translation Preference Learning","authors":["Alexandra Dragomir","Florin Brad","Radu Tudor Ionescu"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI","cs.LG"],"pdf_url":"https://arxiv.org/pdf/2601.05858v1.pdf","html_url":"https://arxiv.org/abs/2601.05858v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05853v1","updated":"2026-01-09T15:30:12Z","published":"2026-01-09T15:30:12Z","title":"LayerGS: Decomposition and Inpainting of Layered 3D Human Avatars via 2D Gaussian Splatting","authors":["Yinghan Xu","John Dingliana"],"primary_category":"cs.CV","categories":["cs.CV","cs.AI","cs.GR"],"pdf_url":"https://arxiv.org/pdf/2601.05853v1.pdf","html_url":"https://arxiv.org/abs/2601.05853v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05851v1","updated":"2026-01-09T15:29:50Z","published":"2026-01-09T15:29:50Z","title":"Router-Suggest: Dynamic Routing for Multimodal Auto-Completion in Visually-Grounded Dialogs","authors":["Sandeep Mishra","Devichand Budagam","Anubhab Mandal","Bishal Santra","Pawan Goyal","Manish Gupta"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI","cs.CV"],"pdf_url":"https://arxiv.org/pdf/2601.05851v1.pdf","html_url":"https://arxiv.org/abs/2601.05851v1","comment":"Accepted to EACL 2026 Industry Track, 12 pages, 6 figures"}
{"id":"http://arxiv.org/abs/2601.05848v1","updated":"2026-01-09T15:23:36Z","published":"2026-01-09T15:23:36Z","title":"Goal Force: Teaching Video Models To Accomplish Physics-Conditioned Goals","authors":["Nate Gillman","Yinghua Zhou","Zitian Tang","Evan Luo","Arjan Chakravarthy","Daksh Aggarwal","Michael Freeman","Charles Herrmann","Chen Sun"],"primary_category":"cs.CV","categories":["cs.CV","cs.AI","cs.RO"],"pdf_url":"https://arxiv.org/pdf/2601.05848v1.pdf","html_url":"https://arxiv.org/abs/2601.05848v1","comment":"Code and interactive demos at https://goal-force.github.io/"}
{"id":"http://arxiv.org/abs/2601.05844v1","updated":"2026-01-09T15:16:31Z","published":"2026-01-09T15:16:31Z","title":"DexterCap: An Affordable and Automated System for Capturing Dexterous Hand-Object Manipulation","authors":["Yutong Liang","Shiyi Xu","Yulong Zhang","Bowen Zhan","He Zhang","Libin Liu"],"primary_category":"cs.GR","categories":["cs.GR","cs.AI","cs.RO"],"pdf_url":"https://arxiv.org/pdf/2601.05844v1.pdf","html_url":"https://arxiv.org/abs/2601.05844v1","comment":"12 pages, 12 figures"}
{"id":"http://arxiv.org/abs/2601.05836v1","updated":"2026-01-09T15:10:23Z","published":"2026-01-09T15:10:23Z","title":"Intelligent Singularity Avoidance in UR10 Robotic Arm Path Planning Using Hybrid Fuzzy Logic and Reinforcement Learning","authors":["Sheng-Kai Chen","Jyh-Horng Wu"],"primary_category":"cs.RO","categories":["cs.RO","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05836v1.pdf","html_url":"https://arxiv.org/abs/2601.05836v1","comment":"Published in TANET 2025 (Paper No. T0404)"}
{"id":"http://arxiv.org/abs/2601.05828v1","updated":"2026-01-09T15:01:47Z","published":"2026-01-09T15:01:47Z","title":"Influence of Parallelism in Vector-Multiplication Units on Correlation Power Analysis","authors":["Manuel Brosch","Matthias Probst","Stefan Kögler","Georg Sigl"],"primary_category":"cs.CR","categories":["cs.CR","cs.AI","cs.IR"],"pdf_url":"https://arxiv.org/pdf/2601.05828v1.pdf","html_url":"https://arxiv.org/abs/2601.05828v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05825v1","updated":"2026-01-09T14:59:25Z","published":"2026-01-09T14:59:25Z","title":"Decoding Workload and Agreement From EEG During Spoken Dialogue With Conversational AI","authors":["Lucija Mihić Zidar","Philipp Wicke","Praneel Bhatia","Rosa Lutz","Marius Klug","Thorsten O. Zander"],"primary_category":"cs.HC","categories":["cs.HC","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05825v1.pdf","html_url":"https://arxiv.org/abs/2601.05825v1","comment":"Accepted at the 14th International Winter Conference on Brain-Computer Interface"}
{"id":"http://arxiv.org/abs/2601.05810v1","updated":"2026-01-09T14:33:10Z","published":"2026-01-09T14:33:10Z","title":"SceneFoundry: Generating Interactive Infinite 3D Worlds","authors":["ChunTeng Chen","YiChen Hsu","YiWen Liu","WeiFang Sun","TsaiChing Ni","ChunYi Lee","Min Sun","YuanFu Yang"],"primary_category":"cs.CV","categories":["cs.CV","cs.AI","cs.LG","cs.RO"],"pdf_url":"https://arxiv.org/pdf/2601.05810v1.pdf","html_url":"https://arxiv.org/abs/2601.05810v1","comment":"15 pages"}
{"id":"http://arxiv.org/abs/2601.05808v1","updated":"2026-01-09T14:32:06Z","published":"2026-01-09T14:32:06Z","title":"EnvScaler: Scaling Tool-Interactive Environments for LLM Agent via Programmatic Synthesis","authors":["Xiaoshuai Song","Haofei Chang","Guanting Dong","Yutao Zhu","Zhicheng Dou","Ji-Rong Wen"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI","cs.LG"],"pdf_url":"https://arxiv.org/pdf/2601.05808v1.pdf","html_url":"https://arxiv.org/abs/2601.05808v1","comment":"Working in progress"}
{"id":"http://arxiv.org/abs/2601.05792v1","updated":"2026-01-09T13:39:49Z","published":"2026-01-09T13:39:49Z","title":"Tensor-DTI: Enhancing Biomolecular Interaction Prediction with Contrastive Embedding Learning","authors":["Manel Gil-Sorribes","Júlia Vilalta-Mor","Isaac Filella-Mercè","Robert Soliva","Álvaro Ciudad","Víctor Guallar","Alexis Molina"],"primary_category":"cs.LG","categories":["cs.LG","cs.AI","q-bio.BM"],"pdf_url":"https://arxiv.org/pdf/2601.05792v1.pdf","html_url":"https://arxiv.org/abs/2601.05792v1","comment":"Accepted at the Generative and Experimental Perspectives for Biomolecular Design Workshop at ICLR 2025 and at the Learning Meaningful Representations of Life Workshop at ICLR 2025"}
{"id":"http://arxiv.org/abs/2601.05789v1","updated":"2026-01-09T13:29:41Z","published":"2026-01-09T13:29:41Z","title":"SAFE: Secure and Accurate Federated Learning for Privacy-Preserving Brain-Computer Interfaces","authors":["Tianwang Jia","Xiaoqing Chen","Dongrui Wu"],"primary_category":"cs.HC","categories":["cs.HC","cs.AI","cs.LG"],"pdf_url":"https://arxiv.org/pdf/2601.05789v1.pdf","html_url":"https://arxiv.org/abs/2601.05789v1","comment":"12 pages, 9 figures"}
{"id":"http://arxiv.org/abs/2601.05787v1","updated":"2026-01-09T13:26:38Z","published":"2026-01-09T13:26:38Z","title":"From Off-Policy to On-Policy: Enhancing GUI Agents via Bi-level Expert-to-Policy Assimilation","authors":["Zezhou Wang","Ziyun Zhang","Xiaoyi Zhang","Zhuzhong Qian","Yan Lu"],"primary_category":"cs.AI","categories":["cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05787v1.pdf","html_url":"https://arxiv.org/abs/2601.05787v1","comment":"Work In Progress"}
{"id":"http://arxiv.org/abs/2601.05785v1","updated":"2026-01-09T13:22:37Z","published":"2026-01-09T13:22:37Z","title":"Adaptive Disentangled Representation Learning for Incomplete Multi-View Multi-Label Classification","authors":["Quanjiang Li","Zhiming Liu","Tianxiang Xu","Tingjin Luo","Chenping Hou"],"primary_category":"cs.CV","categories":["cs.CV","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05785v1.pdf","html_url":"https://arxiv.org/abs/2601.05785v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05759v1","updated":"2026-01-09T12:28:02Z","published":"2026-01-09T12:28:02Z","title":"Variational Autoencoders for P-wave Detection on Strong Motion Earthquake Spectrograms","authors":["Turkan Simge Ispak","Salih Tileylioglu","Erdem Akagunduz"],"primary_category":"cs.LG","categories":["cs.LG","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05759v1.pdf","html_url":"https://arxiv.org/abs/2601.05759v1","comment":"13 pages, 8 figures, 3 tables"}
{"id":"http://arxiv.org/abs/2601.05755v1","updated":"2026-01-09T12:19:49Z","published":"2026-01-09T12:19:49Z","title":"VIGIL: Defending LLM Agents Against Tool Stream Injection via Verify-Before-Commit","authors":["Junda Lin","Zhaomeng Zhou","Zhi Zheng","Shuochen Liu","Tong Xu","Yong Chen","Enhong Chen"],"primary_category":"cs.CR","categories":["cs.CR","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05755v1.pdf","html_url":"https://arxiv.org/abs/2601.05755v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05751v1","updated":"2026-01-09T12:07:38Z","published":"2026-01-09T12:07:38Z","title":"Analysing Differences in Persuasive Language in LLM-Generated Text: Uncovering Stereotypical Gender Patterns","authors":["Amalie Brogaard Pauli","Maria Barrett","Max Müller-Eberstein","Isabelle Augenstein","Ira Assent"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05751v1.pdf","html_url":"https://arxiv.org/abs/2601.05751v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05746v1","updated":"2026-01-09T12:01:33Z","published":"2026-01-09T12:01:33Z","title":"DynaDebate: Breaking Homogeneity in Multi-Agent Debate with Dynamic Path Generation","authors":["Zhenghao Li","Zhi Zheng","Wei Chen","Jielun Zhao","Yong Chen","Tong Xu","Enhong Chen"],"primary_category":"cs.AI","categories":["cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05746v1.pdf","html_url":"https://arxiv.org/abs/2601.05746v1","comment":"16pages,6figures"}
{"id":"http://arxiv.org/abs/2601.05742v1","updated":"2026-01-09T11:46:32Z","published":"2026-01-09T11:46:32Z","title":"The Echo Chamber Multi-Turn LLM Jailbreak","authors":["Ahmad Alobaid","Martí Jordà Roca","Carlos Castillo","Joan Vendrell"],"primary_category":"cs.CR","categories":["cs.CR","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05742v1.pdf","html_url":"https://arxiv.org/abs/2601.05742v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05739v1","updated":"2026-01-09T11:40:56Z","published":"2026-01-09T11:40:56Z","title":"PII-VisBench: Evaluating Personally Identifiable Information Safety in Vision Language Models Along a Continuum of Visibility","authors":["G M Shahariar","Zabir Al Nazi","Md Olid Hasan Bhuiyan","Zhouxing Shi"],"primary_category":"cs.AI","categories":["cs.AI","cs.CL","cs.CR","cs.CV"],"pdf_url":"https://arxiv.org/pdf/2601.05739v1.pdf","html_url":"https://arxiv.org/abs/2601.05739v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05732v1","updated":"2026-01-09T11:19:14Z","published":"2026-01-09T11:19:14Z","title":"mHC-lite: You Don't Need 20 Sinkhorn-Knopp Iterations","authors":["Yongyi Yang","Jianyang Gao"],"primary_category":"cs.LG","categories":["cs.LG","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05732v1.pdf","html_url":"https://arxiv.org/abs/2601.05732v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05724v1","updated":"2026-01-09T11:10:29Z","published":"2026-01-09T11:10:29Z","title":"Overcoming Joint Intractability with Lossless Hierarchical Speculative Decoding","authors":["Yuxuan Zhou","Fei Huang","Heng Li","Fengyi Wu","Tianyu Wang","Jianwei Zhang","Junyang Lin","Zhi-Qi Cheng"],"primary_category":"cs.AI","categories":["cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05724v1.pdf","html_url":"https://arxiv.org/abs/2601.05724v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05713v1","updated":"2026-01-09T10:58:17Z","published":"2026-01-09T10:58:17Z","title":"Visualising Information Flow in Word Embeddings with Diffusion Tensor Imaging","authors":["Thomas Fabian"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI","cs.LG"],"pdf_url":"https://arxiv.org/pdf/2601.05713v1.pdf","html_url":"https://arxiv.org/abs/2601.05713v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05707v1","updated":"2026-01-09T10:52:23Z","published":"2026-01-09T10:52:23Z","title":"Multimodal In-context Learning for ASR of Low-resource Languages","authors":["Zhaolin Li","Jan Niehues"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05707v1.pdf","html_url":"https://arxiv.org/abs/2601.05707v1","comment":"Under review"}
{"id":"http://arxiv.org/abs/2601.05705v1","updated":"2026-01-09T10:47:30Z","published":"2026-01-09T10:47:30Z","title":"Logic-Parametric Neuro-Symbolic NLI: Controlling Logical Formalisms for Verifiable LLM Reasoning","authors":["Ali Farjami","Luca Redondi","Marco Valentino"],"primary_category":"cs.AI","categories":["cs.AI","cs.CL","cs.LO"],"pdf_url":"https://arxiv.org/pdf/2601.05705v1.pdf","html_url":"https://arxiv.org/abs/2601.05705v1","comment":"Work in progress"}
{"id":"http://arxiv.org/abs/2601.05703v1","updated":"2026-01-09T10:46:42Z","published":"2026-01-09T10:46:42Z","title":"AIBoMGen: Generating an AI Bill of Materials for Secure, Transparent, and Compliant Model Training","authors":["Wiebe Vandendriessche","Jordi Thijsman","Laurens D'hooge","Bruno Volckaert","Merlijn Sebrechts"],"primary_category":"cs.SE","categories":["cs.SE","cs.AI","cs.CR"],"pdf_url":"https://arxiv.org/pdf/2601.05703v1.pdf","html_url":"https://arxiv.org/abs/2601.05703v1","comment":"Accepted at ACM/IEEE CAIN 2026"}
{"id":"http://arxiv.org/abs/2601.05693v1","updated":"2026-01-09T10:23:55Z","published":"2026-01-09T10:23:55Z","title":"Circular Reasoning: Understanding Self-Reinforcing Loops in Large Reasoning Models","authors":["Zenghao Duan","Liang Pang","Zihao Wei","Wenbin Duan","Yuxin Tian","Shicheng Xu","Jingcheng Deng","Zhiyi Yin","Xueqi Cheng"],"primary_category":"cs.AI","categories":["cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05693v1.pdf","html_url":"https://arxiv.org/abs/2601.05693v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05683v1","updated":"2026-01-09T10:05:07Z","published":"2026-01-09T10:05:07Z","title":"Joint Optimization of Neural Autoregressors via Scoring rules","authors":["Jonas Landsgesell"],"primary_category":"cond-mat.soft","categories":["cond-mat.soft","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05683v1.pdf","html_url":"https://arxiv.org/abs/2601.05683v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05680v1","updated":"2026-01-09T09:57:12Z","published":"2026-01-09T09:57:12Z","title":"AGDC: Autoregressive Generation of Variable-Length Sequences with Joint Discrete and Continuous Spaces","authors":["Yeonsang Shin","Insoo Kim","Bongkeun Kim","Keonwoo Bae","Bohyung Han"],"primary_category":"cs.LG","categories":["cs.LG","cs.AI","cs.CV"],"pdf_url":"https://arxiv.org/pdf/2601.05680v1.pdf","html_url":"https://arxiv.org/abs/2601.05680v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05675v1","updated":"2026-01-09T09:50:47Z","published":"2026-01-09T09:50:47Z","title":"CHDP: Cooperative Hybrid Diffusion Policies for Reinforcement Learning in Parameterized Action Space","authors":["Bingyi Liu","Jinbo He","Haiyong Shi","Enshu Wang","Weizhen Han","Jingxiang Hao","Peixi Wang","Zhuangzhuang Zhang"],"primary_category":"cs.AI","categories":["cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05675v1.pdf","html_url":"https://arxiv.org/abs/2601.05675v1","comment":"Accepted by AAAI 2026"}
{"id":"http://arxiv.org/abs/2601.05666v1","updated":"2026-01-09T09:39:12Z","published":"2026-01-09T09:39:12Z","title":"Advancing credit mobility through stakeholder-informed AI design and adoption","authors":["Yerin Kwak","Siddharth Adelkar","Zachary A. Pardos"],"primary_category":"cs.HC","categories":["cs.HC","cs.AI","cs.CY"],"pdf_url":"https://arxiv.org/pdf/2601.05666v1.pdf","html_url":"https://arxiv.org/abs/2601.05666v1","comment":"17 pages, 8 figures"}
{"id":"http://arxiv.org/abs/2601.05657v1","updated":"2026-01-09T09:27:17Z","published":"2026-01-09T09:27:17Z","title":"Stephanie2: Thinking, Waiting, and Making Decisions Like Humans in Step-by-Step AI Social Chat","authors":["Hao Yang","Hongyuan Lu","Dingkang Yang","Wenliang Yang","Peng Sun","Xiaochuan Zhang","Jun Xiao","Kefan He","Wai Lam","Yang Liu","Xinhua Zeng"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05657v1.pdf","html_url":"https://arxiv.org/abs/2601.05657v1","comment":"13 pages"}
{"id":"http://arxiv.org/abs/2601.05656v1","updated":"2026-01-09T09:26:08Z","published":"2026-01-09T09:26:08Z","title":"HAG: Hierarchical Demographic Tree-based Agent Generation for Topic-Adaptive Simulation","authors":["Rongxin Chen","Tianyu Wu","Bingbing Xu","Xiucheng Xu","Huawei Shen"],"primary_category":"cs.AI","categories":["cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05656v1.pdf","html_url":"https://arxiv.org/abs/2601.05656v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05654v1","updated":"2026-01-09T09:22:31Z","published":"2026-01-09T09:22:31Z","title":"A Framework for Personalized Persuasiveness Prediction via Context-Aware User Profiling","authors":["Sejun Park","Yoonah Park","Jongwon Lim","Yohan Jo"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05654v1.pdf","html_url":"https://arxiv.org/abs/2601.05654v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05648v1","updated":"2026-01-09T09:10:14Z","published":"2026-01-09T09:10:14Z","title":"Open World Knowledge Aided Single-Cell Foundation Model with Robust Cross-Modal Cell-Language Pre-training","authors":["Haoran Wang","Xuanyi Zhang","Shuangsang Fang","Longke Ran","Ziqing Deng","Yong Zhang","Yuxiang Li","Shaoshuai Li"],"primary_category":"q-bio.GN","categories":["q-bio.GN","cs.AI","cs.CL","cs.LG"],"pdf_url":"https://arxiv.org/pdf/2601.05648v1.pdf","html_url":"https://arxiv.org/abs/2601.05648v1","comment":"41 pages"}
{"id":"http://arxiv.org/abs/2601.05647v1","updated":"2026-01-09T09:10:04Z","published":"2026-01-09T09:10:04Z","title":"Transformer Is Inherently a Causal Learner","authors":["Xinyue Wang","Stephen Wang","Biwei Huang"],"primary_category":"cs.LG","categories":["cs.LG","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05647v1.pdf","html_url":"https://arxiv.org/abs/2601.05647v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05637v1","updated":"2026-01-09T08:50:02Z","published":"2026-01-09T08:50:02Z","title":"GenCtrl -- A Formal Controllability Toolkit for Generative Models","authors":["Emily Cheng","Carmen Amo Alonso","Federico Danieli","Arno Blaas","Luca Zappella","Pau Rodriguez","Xavier Suau"],"primary_category":"cs.AI","categories":["cs.AI","cs.LG","eess.SY"],"pdf_url":"https://arxiv.org/pdf/2601.05637v1.pdf","html_url":"https://arxiv.org/abs/2601.05637v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05629v1","updated":"2026-01-09T08:34:05Z","published":"2026-01-09T08:34:05Z","title":"Cumulative Path-Level Semantic Reasoning for Inductive Knowledge Graph Completion","authors":["Jiapu Wang","Xinghe Cheng","Zezheng Wu","Ruiqi Ma","Rui Wang","Zhichao Yan","Haoran Luo","Yuhao Jiang","Kai Sun"],"primary_category":"cs.AI","categories":["cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05629v1.pdf","html_url":"https://arxiv.org/abs/2601.05629v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05613v1","updated":"2026-01-09T08:11:45Z","published":"2026-01-09T08:11:45Z","title":"PiXTime: A Model for Federated Time Series Forecasting with Heterogeneous Data Structures Across Nodes","authors":["Yiming Zhou","Mingyue Cheng","Hao Wang","Enhong Chen"],"primary_category":"cs.LG","categories":["cs.LG","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05613v1.pdf","html_url":"https://arxiv.org/abs/2601.05613v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05590v1","updated":"2026-01-09T07:19:35Z","published":"2026-01-09T07:19:35Z","title":"A Causal Information-Flow Framework for Unbiased Learning-to-Rank","authors":["Haoming Gong","Qingyao Ai","Zhihao Tao","Yongfeng Zhang"],"primary_category":"cs.AI","categories":["cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05590v1.pdf","html_url":"https://arxiv.org/abs/2601.05590v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05589v1","updated":"2026-01-09T07:17:28Z","published":"2026-01-09T07:17:28Z","title":"ACR: Adaptive Context Refactoring via Context Refactoring Operators for Multi-Turn Dialogue","authors":["Jiawei Shen","Jia Zhu","Hanghui Guo","Weijie Shi","Yue Cui","Qingyu Niu","Guoqing Ma","Yidan Liang","Jingjiang Liu","Yiling Wang","Shimin Di","Jiajie Xu"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05589v1.pdf","html_url":"https://arxiv.org/abs/2601.05589v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05588v1","updated":"2026-01-09T07:16:28Z","published":"2026-01-09T07:16:28Z","title":"Autoregressive Ranking: Bridging the Gap Between Dual and Cross Encoders","authors":["Benjamin Rozonoyer","Chong You","Michael Boratko","Himanshu Jain","Nilesh Gupta","Srinadh Bhojanapalli","Andrew McCallum","Felix Yu"],"primary_category":"cs.IR","categories":["cs.IR","cs.AI","cs.LG"],"pdf_url":"https://arxiv.org/pdf/2601.05588v1.pdf","html_url":"https://arxiv.org/abs/2601.05588v1","comment":"22 pages, 5 figures"}
{"id":"http://arxiv.org/abs/2601.05587v1","updated":"2026-01-09T07:14:29Z","published":"2026-01-09T07:14:29Z","title":"HogVul: Black-box Adversarial Code Generation Framework Against LM-based Vulnerability Detectors","authors":["Jingxiao Yang","Ping He","Tianyu Du","Sun Bing","Xuhong Zhang"],"primary_category":"cs.CR","categories":["cs.CR","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05587v1.pdf","html_url":"https://arxiv.org/abs/2601.05587v1","comment":"AAAI26"}
{"id":"http://arxiv.org/abs/2601.05584v1","updated":"2026-01-09T07:12:23Z","published":"2026-01-09T07:12:23Z","title":"GS-DMSR: Dynamic Sensitive Multi-scale Manifold Enhancement for Accelerated High-Quality 3D Gaussian Splatting","authors":["Nengbo Lu","Minghua Pan","Shaohua Sun","Yizhou Liang"],"primary_category":"cs.CV","categories":["cs.CV","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05584v1.pdf","html_url":"https://arxiv.org/abs/2601.05584v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05579v1","updated":"2026-01-09T07:00:44Z","published":"2026-01-09T07:00:44Z","title":"RISE: Rule-Driven SQL Dialect Translation via Query Reduction","authors":["Xudong Xie","Yuwei Zhang","Wensheng Dou","Yu Gao","Ziyu Cui","Jiansen Song","Rui Yang","Jun Wei"],"primary_category":"cs.DB","categories":["cs.DB","cs.AI","cs.CL","cs.SE"],"pdf_url":"https://arxiv.org/pdf/2601.05579v1.pdf","html_url":"https://arxiv.org/abs/2601.05579v1","comment":"Accepted by ICSE 2026"}
{"id":"http://arxiv.org/abs/2601.05578v1","updated":"2026-01-09T06:56:27Z","published":"2026-01-09T06:56:27Z","title":"Reinforcement Learning of Large Language Models for Interpretable Credit Card Fraud Detection","authors":["Cooper Lin","Yanting Zhang","Maohao Ran","Wei Xue","Hongwei Fan","Yibo Xu","Zhenglin Wan","Sirui Han","Yike Guo","Jun Song"],"primary_category":"cs.AI","categories":["cs.AI","cs.CE"],"pdf_url":"https://arxiv.org/pdf/2601.05578v1.pdf","html_url":"https://arxiv.org/abs/2601.05578v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05570v1","updated":"2026-01-09T06:41:49Z","published":"2026-01-09T06:41:49Z","title":"Crisis-Bench: Benchmarking Strategic Ambiguity and Reputation Management in Large Language Models","authors":["Cooper Lin","Maohao Ran","Yanting Zhang","Zhenglin Wan","Hongwei Fan","Yibo Xu","Yike Guo","Wei Xue","Jun Song"],"primary_category":"cs.AI","categories":["cs.AI","cs.MA"],"pdf_url":"https://arxiv.org/pdf/2601.05570v1.pdf","html_url":"https://arxiv.org/abs/2601.05570v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05567v1","updated":"2026-01-09T06:35:23Z","published":"2026-01-09T06:35:23Z","title":"WildSci: Advancing Scientific Reasoning from In-the-Wild Literature","authors":["Tengxiao Liu","Deepak Nathani","Zekun Li","Kevin Yang","William Yang Wang"],"primary_category":"cs.AI","categories":["cs.AI","cs.CL"],"pdf_url":"https://arxiv.org/pdf/2601.05567v1.pdf","html_url":"https://arxiv.org/abs/2601.05567v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05560v1","updated":"2026-01-09T06:19:00Z","published":"2026-01-09T06:19:00Z","title":"ReasonAny: Incorporating Reasoning Capability to Any Model via Simple and Effective Model Merging","authors":["Junyao Yang","Chen Qian","Dongrui Liu","Wen Shen","Yong Liu","Jing Shao"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05560v1.pdf","html_url":"https://arxiv.org/abs/2601.05560v1","comment":"22 pages, 6 figures, 14 tables"}
{"id":"http://arxiv.org/abs/2601.05556v1","updated":"2026-01-09T06:13:53Z","published":"2026-01-09T06:13:53Z","title":"Semi-Supervised Facial Expression Recognition based on Dynamic Threshold and Negative Learning","authors":["Zhongpeng Cai","Jun Yu","Wei Xu","Tianyu Liu","Jianqing Sun","Jiaen Liang"],"primary_category":"cs.CV","categories":["cs.CV","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05556v1.pdf","html_url":"https://arxiv.org/abs/2601.05556v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05547v1","updated":"2026-01-09T05:58:22Z","published":"2026-01-09T05:58:22Z","title":"VIB-Probe: Detecting and Mitigating Hallucinations in Vision-Language Models via Variational Information Bottleneck","authors":["Feiran Zhang","Yixin Wu","Zhenghua Wang","Xiaohua Wang","Changze Lv","Xuanjing Huang","Xiaoqing Zheng"],"primary_category":"cs.CV","categories":["cs.CV","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05547v1.pdf","html_url":"https://arxiv.org/abs/2601.05547v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05542v1","updated":"2026-01-09T05:51:35Z","published":"2026-01-09T05:51:35Z","title":"Understanding LLM-Driven Test Oracle Generation","authors":["Adam Bodicoat","Gunel Jahangirova","Valerio Terragni"],"primary_category":"cs.SE","categories":["cs.SE","cs.AI","cs.LG"],"pdf_url":"https://arxiv.org/pdf/2601.05542v1.pdf","html_url":"https://arxiv.org/abs/2601.05542v1","comment":"Accepted for presentation at the 2nd ACM/IEEE International Conference on AI-powered Software (AIware 2025)"}
{"id":"http://arxiv.org/abs/2601.05537v1","updated":"2026-01-09T05:23:59Z","published":"2026-01-09T05:23:59Z","title":"Scalable Heterogeneous Graph Learning via Heterogeneous-aware Orthogonal Prototype Experts","authors":["Wei Zhou","Hong Huang","Ruize Shi","Bang Liu"],"primary_category":"cs.LG","categories":["cs.LG","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05537v1.pdf","html_url":"https://arxiv.org/abs/2601.05537v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05529v1","updated":"2026-01-09T05:04:15Z","published":"2026-01-09T05:04:15Z","title":"Safety Not Found (404): Hidden Risks of LLM-Based Robotics Decision Making","authors":["Jua Han","Jaeyoon Seo","Jungbin Min","Jean Oh","Jihie Kim"],"primary_category":"cs.AI","categories":["cs.AI","cs.RO"],"pdf_url":"https://arxiv.org/pdf/2601.05529v1.pdf","html_url":"https://arxiv.org/abs/2601.05529v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05527v1","updated":"2026-01-09T04:54:56Z","published":"2026-01-09T04:54:56Z","title":"DeMa: Dual-Path Delay-Aware Mamba for Efficient Multivariate Time Series Analysis","authors":["Rui An","Haohao Qu","Wenqi Fan","Xuequn Shang","Qing Li"],"primary_category":"cs.LG","categories":["cs.LG","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05527v1.pdf","html_url":"https://arxiv.org/abs/2601.05527v1","comment":"Under review"}
{"id":"http://arxiv.org/abs/2601.05525v1","updated":"2026-01-09T04:43:21Z","published":"2026-01-09T04:43:21Z","title":"Explainable AI: Learning from the Learners","authors":["Ricardo Vinuesa","Steven L. Brunton","Gianmarco Mengaldo"],"primary_category":"cs.AI","categories":["cs.AI","cs.LG","physics.comp-ph","physics.soc-ph"],"pdf_url":"https://arxiv.org/pdf/2601.05525v1.pdf","html_url":"https://arxiv.org/abs/2601.05525v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05503v1","updated":"2026-01-09T03:24:46Z","published":"2026-01-09T03:24:46Z","title":"Over-Searching in Search-Augmented Large Language Models","authors":["Roy Xie","Deepak Gopinath","David Qiu","Dong Lin","Haitian Sun","Saloni Potdar","Bhuwan Dhingra"],"primary_category":"cs.LG","categories":["cs.LG","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05503v1.pdf","html_url":"https://arxiv.org/abs/2601.05503v1","comment":"Accepted to EACL 2026 Main Conference"}
{"id":"http://arxiv.org/abs/2601.05502v1","updated":"2026-01-09T03:21:49Z","published":"2026-01-09T03:21:49Z","title":"Evaluating the Use of LLMs for Automated DOM-Level Resolution of Web Performance Issues","authors":["Gideon Peters","SayedHassan Khatoonabadi","Emad Shihab"],"primary_category":"cs.SE","categories":["cs.SE","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05502v1.pdf","html_url":"https://arxiv.org/abs/2601.05502v1","comment":"Accepted to the The ACM International Conference on Mining Software Repositories (MSR) (MSR 2026)"}
{"id":"http://arxiv.org/abs/2601.05500v1","updated":"2026-01-09T03:19:37Z","published":"2026-01-09T03:19:37Z","title":"The Evaluation Gap in Medicine, AI and LLMs: Navigating Elusive Ground Truth & Uncertainty via a Probabilistic Paradigm","authors":["Aparna Elangovan","Lei Xu","Mahsa Elyasi","Ismail Akdulum","Mehmet Aksakal","Enes Gurun","Brian Hur","Saab Mansour","Ravid Shwartz Ziv","Karin Verspoor","Dan Roth"],"primary_category":"cs.AI","categories":["cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05500v1.pdf","html_url":"https://arxiv.org/abs/2601.05500v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05498v1","updated":"2026-01-09T03:02:41Z","published":"2026-01-09T03:02:41Z","title":"Prompt-Free SAM-Based Multi-Task Framework for Breast Ultrasound Lesion Segmentation and Classification","authors":["Samuel E. Johnny","Bernes L. Atabonfack","Israel Alagbe","Assane Gueye"],"primary_category":"cs.CV","categories":["cs.CV","cs.AI","cs.LG"],"pdf_url":"https://arxiv.org/pdf/2601.05498v1.pdf","html_url":"https://arxiv.org/abs/2601.05498v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05483v1","updated":"2026-01-09T02:34:35Z","published":"2026-01-09T02:34:35Z","title":"MMUEChange: A Generalized LLM Agent Framework for Intelligent Multi-Modal Urban Environment Change Analysis","authors":["Zixuan Xiao","Jun Ma","Siwei Zhang"],"primary_category":"cs.AI","categories":["cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05483v1.pdf","html_url":"https://arxiv.org/abs/2601.05483v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05474v1","updated":"2026-01-09T02:18:59Z","published":"2026-01-09T02:18:59Z","title":"Efficient Differentiable Causal Discovery via Reliable Super-Structure Learning","authors":["Pingchuan Ma","Qixin Zhang","Shuai Wang","Dacheng Tao"],"primary_category":"cs.LG","categories":["cs.LG","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05474v1.pdf","html_url":"https://arxiv.org/abs/2601.05474v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05467v1","updated":"2026-01-09T01:49:41Z","published":"2026-01-09T01:49:41Z","title":"STELP: Secure Transpilation and Execution of LLM-Generated Programs","authors":["Swapnil Shinde","Sahil Wadhwa","Andy Luo","Emily Chen"],"primary_category":"cs.SE","categories":["cs.SE","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05467v1.pdf","html_url":"https://arxiv.org/abs/2601.05467v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05466v1","updated":"2026-01-09T01:41:39Z","published":"2026-01-09T01:41:39Z","title":"Jailbreaking Large Language Models through Iterative Tool-Disguised Attacks via Reinforcement Learning","authors":["Zhaoqi Wang","Zijian Zhang","Daqing He","Pengtao Kou","Xin Li","Jiamou Liu","Jincheng An","Yong Liu"],"primary_category":"cs.CR","categories":["cs.CR","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05466v1.pdf","html_url":"https://arxiv.org/abs/2601.05466v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05465v1","updated":"2026-01-09T01:38:38Z","published":"2026-01-09T01:38:38Z","title":"PRISMA: Reinforcement Learning Guided Two-Stage Policy Optimization in Multi-Agent Architecture for Open-Domain Multi-Hop Question Answering","authors":["Yu Liu","Wenxiao Zhang","Cong Cao","Wenxuan Lu","Fangfang Yuan","Diandian Guo","Kun Peng","Qiang Sun","Kaiyan Zhang","Yanbing Liu","Jin B. Hong","Bowen Zhou","Zhiyuan Ma"],"primary_category":"cs.AI","categories":["cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05465v1.pdf","html_url":"https://arxiv.org/abs/2601.05465v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05459v1","updated":"2026-01-09T01:17:31Z","published":"2026-01-09T01:17:31Z","title":"Do LLMs Need Inherent Reasoning Before Reinforcement Learning? A Study in Korean Self-Correction","authors":["Hongjin Kim","Jaewook Lee","Kiyoung Lee","Jong-hun Shin","Soojong Lim","Oh-Woog Kwon"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05459v1.pdf","html_url":"https://arxiv.org/abs/2601.05459v1","comment":"IJCNLP-AACL 2025 (Main), Outstanding Paper Award"}
{"id":"http://arxiv.org/abs/2601.05455v1","updated":"2026-01-09T01:01:55Z","published":"2026-01-09T01:01:55Z","title":"ART: Adaptive Reasoning Trees for Explainable Claim Verification","authors":["Sahil Wadhwa","Himanshu Kumar","Guanqun Yang","Abbaas Alif Mohamed Nishar","Pranab Mohanty","Swapnil Shinde","Yue Wu"],"primary_category":"cs.AI","categories":["cs.AI","cs.LG"],"pdf_url":"https://arxiv.org/pdf/2601.05455v1.pdf","html_url":"https://arxiv.org/abs/2601.05455v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05437v1","updated":"2026-01-09T00:09:28Z","published":"2026-01-09T00:09:28Z","title":"Tracing Moral Foundations in Large Language Models","authors":["Chenxiao Yu","Bowen Yi","Farzan Karimi-Malekabadi","Suhaib Abdurahman","Jinyi Ye","Shrikanth Narayanan","Yue Zhao","Morteza Dehghani"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05437v1.pdf","html_url":"https://arxiv.org/abs/2601.05437v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05432v1","updated":"2026-01-08T23:47:30Z","published":"2026-01-08T23:47:30Z","title":"Thinking with Map: Reinforced Parallel Map-Augmented Agent for Geolocalization","authors":["Yuxiang Ji","Yong Wang","Ziyu Ma","Yiming Hu","Hailang Huang","Xuecai Hu","Guanhua Chen","Liaoni Wu","Xiangxiang Chu"],"primary_category":"cs.CV","categories":["cs.CV","cs.AI","cs.CL"],"pdf_url":"https://arxiv.org/pdf/2601.05432v1.pdf","html_url":"https://arxiv.org/abs/2601.05432v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05399v1","updated":"2026-01-08T21:44:00Z","published":"2026-01-08T21:44:00Z","title":"Multi-task Cross-modal Learning for Chest X-ray Image Retrieval","authors":["Zhaohui Liang","Sivaramakrishnan Rajaraman","Niccolo Marini","Zhiyun Xue","Sameer Antani"],"primary_category":"cs.CV","categories":["cs.CV","cs.AI","cs.IR"],"pdf_url":"https://arxiv.org/pdf/2601.05399v1.pdf","html_url":"https://arxiv.org/abs/2601.05399v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05386v1","updated":"2026-01-08T21:18:45Z","published":"2026-01-08T21:18:45Z","title":"On the Effect of Cheating in Chess","authors":["Daniel Keren"],"primary_category":"cs.AI","categories":["cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05386v1.pdf","html_url":"https://arxiv.org/abs/2601.05386v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05384v1","updated":"2026-01-08T21:16:28Z","published":"2026-01-08T21:16:28Z","title":"Conformity and Social Impact on AI Agents","authors":["Alessandro Bellina","Giordano De Marzo","David Garcia"],"primary_category":"cs.AI","categories":["cs.AI","cs.CL","cs.CY"],"pdf_url":"https://arxiv.org/pdf/2601.05384v1.pdf","html_url":"https://arxiv.org/abs/2601.05384v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05376v1","updated":"2026-01-08T21:01:11Z","published":"2026-01-08T21:01:11Z","title":"The Persona Paradox: Medical Personas as Behavioral Priors in Clinical Language Models","authors":["Tassallah Abdullahi","Shrestha Ghosh","Hamish S Fraser","Daniel León Tramontini","Adeel Abbasi","Ghada Bourjeily","Carsten Eickhoff","Ritambhara Singh"],"primary_category":"cs.AI","categories":["cs.AI","cs.CL"],"pdf_url":"https://arxiv.org/pdf/2601.05376v1.pdf","html_url":"https://arxiv.org/abs/2601.05376v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05373v1","updated":"2026-01-08T20:54:24Z","published":"2026-01-08T20:54:24Z","title":"Ensemble of radiomics and ConvNeXt for breast cancer diagnosis","authors":["Jorge Alberto Garza-Abdala","Gerardo Alejandro Fumagal-González","Beatriz A. Bosques-Palomo","Mario Alexis Monsivais Molina","Daly Avedano","Servando Cardona-Huerta","José Gerardo Tamez-Pena"],"primary_category":"cs.CV","categories":["cs.CV","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05373v1.pdf","html_url":"https://arxiv.org/abs/2601.05373v1","comment":"Accepted and presented at the IEEE International Symposium on Computer-Based Medical Systems (CBMS) 2025"}
{"id":"http://arxiv.org/abs/2601.05366v1","updated":"2026-01-08T20:44:28Z","published":"2026-01-08T20:44:28Z","title":"Lost in Execution: On the Multilingual Robustness of Tool Calling in Large Language Models","authors":["Zheng Luo","T Pranav Kutralingam","Ogochukwu N Okoani","Wanpeng Xu","Hua Wei","Xiyang Hu"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI","cs.LG"],"pdf_url":"https://arxiv.org/pdf/2601.05366v1.pdf","html_url":"https://arxiv.org/abs/2601.05366v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05364v1","updated":"2026-01-08T20:39:50Z","published":"2026-01-08T20:39:50Z","title":"STResNet & STYOLO : A New Family of Compact Classification and Object Detection Models for MCUs","authors":["Sudhakar Sah","Ravish Kumar"],"primary_category":"cs.CV","categories":["cs.CV","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05364v1.pdf","html_url":"https://arxiv.org/abs/2601.05364v1","comment":"9 pages, 1 figure"}
{"id":"http://arxiv.org/abs/2601.05356v1","updated":"2026-01-08T20:15:28Z","published":"2026-01-08T20:15:28Z","title":"PRISM: Protocol Refinement through Intelligent Simulation Modeling","authors":["Brian Hsu","Priyanka V Setty","Rory M Butler","Ryan Lewis","Casey Stone","Rebecca Weinberg","Thomas Brettin","Rick Stevens","Ian Foster","Arvind Ramanathan"],"primary_category":"cs.RO","categories":["cs.RO","cs.AI","cs.MA","q-bio.QM"],"pdf_url":"https://arxiv.org/pdf/2601.05356v1.pdf","html_url":"https://arxiv.org/abs/2601.05356v1","comment":"43 pages, 8 figures, submitted to RSC Digital Discovery. Equal contribution: B. Hsu, P.V. Setty, R.M. Butler. Corresponding author: A. Ramanathan"}
{"id":"http://arxiv.org/abs/2601.05355v1","updated":"2026-01-08T20:14:30Z","published":"2026-01-08T20:14:30Z","title":"A Bayesian Generative Modeling Approach for Arbitrary Conditional Inference","authors":["Qiao Liu","Wing Hung Wong"],"primary_category":"stat.ML","categories":["stat.ML","cs.AI","cs.LG","stat.CO","stat.ME"],"pdf_url":"https://arxiv.org/pdf/2601.05355v1.pdf","html_url":"https://arxiv.org/abs/2601.05355v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05339v1","updated":"2026-01-08T19:37:22Z","published":"2026-01-08T19:37:22Z","title":"Multi-turn Jailbreaking Attack in Multi-Modal Large Language Models","authors":["Badhan Chandra Das","Md Tasnim Jawad","Joaquin Molto","M. Hadi Amini","Yanzhao Wu"],"primary_category":"cs.CR","categories":["cs.CR","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05339v1.pdf","html_url":"https://arxiv.org/abs/2601.05339v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05330v1","updated":"2026-01-08T19:17:18Z","published":"2026-01-08T19:17:18Z","title":"Improving Enzyme Prediction with Chemical Reaction Equations by Hypergraph-Enhanced Knowledge Graph Embeddings","authors":["Tengwei Song","Long Yin","Zhen Han","Zhiqiang Xu"],"primary_category":"cs.AI","categories":["cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05330v1.pdf","html_url":"https://arxiv.org/abs/2601.05330v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05328v1","updated":"2026-01-08T19:11:55Z","published":"2026-01-08T19:11:55Z","title":"Bi-Orthogonal Factor Decomposition for Vision Transformers","authors":["Fenil R. Doshi","Thomas Fel","Talia Konkle","George Alvarez"],"primary_category":"cs.CV","categories":["cs.CV","cs.AI"],"pdf_url":"https://arxiv.org/pdf/2601.05328v1.pdf","html_url":"https://arxiv.org/abs/2601.05328v1","comment":null}
{"id":"http://arxiv.org/abs/2601.05242v1","updated":"2026-01-08T18:59:24Z","published":"2026-01-08T18:59:24Z","title":"GDPO: Group reward-Decoupled Normalization Policy Optimization for Multi-reward RL Optimization","authors":["Shih-Yang Liu","Xin Dong","Ximing Lu","Shizhe Diao","Peter Belcak","Mingjie Liu","Min-Hung Chen","Hongxu Yin","Yu-Chiang Frank Wang","Kwang-Ting Cheng","Yejin Choi","Jan Kautz","Pavlo Molchanov"],"primary_category":"cs.CL","categories":["cs.CL","cs.AI","cs.LG"],"pdf_url":"https://arxiv.org/pdf/2601.05242v1.pdf","html_url":"https://arxiv.org/abs/2601.05242v1","comment":"NVIDIA-Tech Report"}
{"id":"http://arxiv.org/abs/2601.05241v1","updated":"2026-01-08T18:59:22Z","published":"2026-01-08T18:59:22Z","title":"RoboVIP: Multi-View Video Generation with Visual Identity Prompting Augments Robot Manipulation","authors":["Boyang Wang","Haoran Zhang","Shujie Zhang","Jinkun Hao","Mingda Jia","Qi Lv","Yucheng Mao","Zhaoyang Lyu","Jia Zeng","Xudong Xu","Jiangmiao Pang"],"primary_category":"cs.CV","categories":["cs.CV","cs.AI","cs.RO"],"pdf_url":"https://arxiv.org/pdf/2601.05241v1.pdf","html_url":"https://arxiv.org/abs/2601.05241v1","comment":null}
@@ -0,0 +1,27 @@
[
{
"question": "How does parallel processing in neural network hardware accelerators affect the effectiveness of correlation-based side-channel power analysis attacks?",
"answer": "Parallel processing in hardware accelerators causes multiple neurons in the same fully-connected layer to perform multiply-and-accumulate operations simultaneously on the same input, which aggregates their power consumption. This concurrency reduces the observable correlation between individual operations and the measured power trace, thereby decreasing the success rate of correlation power analysis attacks. The study derives equations showing that correlation decreases as the level of parallelism increases, and validates these findings using an FPGA-based vector-multiplication unit.",
"file": "texts/Influence of Parallelism in Vector-Multiplication Units on Correlation Power Analysis.txt"
},
{
"question": "How does CHDP address the challenges of learning and optimizing policies in hybrid discrete-continuous action spaces?",
"answer": "CHDP frames the hybrid action space problem as a fully cooperative game between two agents: one using a discrete diffusion policy and the other using a continuous diffusion policy. The continuous policy is conditioned on the discrete action representation to explicitly model their dependency. To avoid update conflicts, CHDP applies a sequential update scheme that promotes co-adaptation. For scalability in high-dimensional discrete spaces, it introduces a codebook that embeds discrete actions into a low-dimensional latent space, along with a Q-function-based guidance mechanism to align the embeddings with the discrete policy. This design improves expressiveness, scalability, and performance, outperforming prior methods by up to 19.3% in success rate.",
"file": "texts/CHDP: Cooperative Hybrid Diffusion Policies for Reinforcement Learning in Parameterized Action Space.txt"
},
{
"question": "What impact do medical personas and interaction styles have on the safety and performance of large language models in clinical decision-making?",
"answer": "Medical personas act as behavioral priors that produce context-dependent and non-monotonic effects in clinical LLMs. They improve accuracy and calibration in critical care tasks by up to about 20%, but can significantly degrade performance in primary-care settings. Interaction styles influence risk-taking and sensitivity, though their effects vary by model. While automated evaluations tend to favor medical personas in safety-critical scenarios, human clinicians show only moderate agreement on safety compliance and report low confidence in reasoning quality, indicating that personas introduce trade-offs rather than guaranteed safety or expertise.",
"file": "texts/The Persona Paradox: Medical Personas as Behavioral Priors in Clinical Language Models.txt"
},
{
"question": "How effective is a Vision Transformer-based deep learning model for segmenting pancreatic tumors in endoscopic ultrasound images?",
"answer": "The Vision Transformer-based segmentation model demonstrates strong and consistent performance on public EUS datasets. Trained and validated on over 17,000 images, it achieved mean Dice similarity coefficients around 0.65, high specificity (over 97%), and accuracy above 97% in both cross-validation and external testing. While results indicate robust tumor segmentation capability, challenges such as dataset heterogeneity, occasional erroneous multiple predictions, and limited external validation suggest that further refinement and prospective evaluation are needed before clinical deployment.",
"file": "texts/Performance of a Deep Learning-Based Segmentation Model for Pancreatic Tumors on Public Endoscopic Ultrasound Datasets.txt"
},
{
"question": "How does the WildSci dataset advance large language model reasoning in scientific domains with limited structured data?",
"answer": "WildSci advances scientific reasoning by providing a large, automatically synthesized dataset of multiple-choice science questions derived from peer-reviewed literature across nine disciplines and 26 subdomains. By converting complex, open-ended scientific reasoning tasks into a multiple-choice format, it enables scalable training with clear reward signals. Reinforcement learning is then used to finetune models on this data, leading to improved domain-specific performance, better generalization, and more informative training dynamics on a range of scientific benchmarks.",
"file": "texts/WildSci: Advancing Scientific Reasoning from In-the-Wild Literature.txt"
}
]
+60
View File
@@ -0,0 +1,60 @@
# Evaluation Results Report
## Summary
Total tasks evaluated: **5**
---
## Time Performance
Average execution time across all tasks:
| Framework | Average Time (seconds) | Status |
|-----------|------------------------|--------|
| Agentic File Search with FileSystem Tools (fs-explorer) | 11.81s | |
| **Traditional RAG** | 9.91s | **Faster** |
**Winner:** Traditional RAG (1.90s faster)
---
## LLM Evaluation Metrics
### Correctness Scores
| Framework | Average Score | Status |
|-----------|---------------|--------|
| Agentic File Search with FileSystem Tools (fs-explorer) | 9.00 | |
| **Traditional RAG** | 9.60 | **Higher** |
**Winner:** Traditional RAG (+0.60 points)
### Relevance Scores
| Framework | Average Score | Status |
|-----------|---------------|--------|
| Agentic File Search with FileSystem Tools (fs-explorer) | 10.00 | **Tie** |
| Traditional RAG | 10.00 | **Tie** |
**Winner:** Tie (both frameworks scored 10.00)
---
## Overall Comparison
| Metric | FS-Explorer | RAG | Winner |
|--------|-------------|-----|--------|
| **Speed** | 11.81s | 9.91s | Traditional RAG |
| **Correctness** | 9.00 | 9.60 | Traditional RAG |
| **Relevance** | 10.00 | 10.00 | Tie |
---
## Key Takeaways
- **Fastest Framework:** Traditional RAG is 1.90s faster on average
- **Most Correct:** Traditional RAG produces more correct answers
- **Most Relevant:** Tie — both frameworks scored 10.00 on relevance
**Overall Winner:** RAG (2/3 metrics; relevance tied)
@@ -0,0 +1,200 @@
[
{
"task": {
"question": "How does parallel processing in neural network hardware accelerators affect the effectiveness of correlation-based side-channel power analysis attacks?",
"answer": "Parallel processing in hardware accelerators causes multiple neurons in the same fully-connected layer to perform multiply-and-accumulate operations simultaneously on the same input, which aggregates their power consumption. This concurrency reduces the observable correlation between individual operations and the measured power trace, thereby decreasing the success rate of correlation power analysis attacks. The study derives equations showing that correlation decreases as the level of parallelism increases, and validates these findings using an FPGA-based vector-multiplication unit.",
"file": "texts/Influence of Parallelism in Vector-Multiplication Units on Correlation Power Analysis.txt"
},
"tool_calls": [
"read",
"glob",
"read"
],
"llm_evaluations": {
"fs_explorer": {
"relevance": 10,
"correctness": 9,
"reason": "The provided answer captures the core mechanism and conclusion from the ground truth: simultaneous MAC operations aggregate power, reducing observable correlation and lowering CPA success as parallelism increases. It omits secondary details (derivation of equations and FPGA validation), so it\u2019s slightly less complete, but still correct on the main point."
},
"rag": {
"relevance": 10,
"correctness": 10,
"reason": "The proposed answer matches the ground truth closely: it explains that parallel MAC operations from multiple neurons on the same input aggregate/overlap power, reducing per-operation correlation and CPA success. It also mentions the derived equations showing correlation decreases with increased parallelism and the FPGA vector-multiplication validation. No material omissions or inaccuracies relative to the ground truth."
}
},
"used_files": {
"fs_explorer": [],
"rag": "/Users/clee/Desktop/code-projects/fs-explorer/data/benchmark/arxiv-100-papers/texts/Influence of Parallelism in Vector-Multiplication Units on Correlation Power Analysis.txt"
},
"has_error": {
"fs_explorer": false,
"rag": false
},
"time_taken": {
"fs_explorer": 11.672821044921875,
"rag": 9.38922119140625
},
"answers": {
"fs_explorer": "Parallel processing in neural network hardware accelerators, specifically when neurons in the same fully-connected layer simultaneously process the same input value via concurrent multiply-and-accumulate operations, reduces the effectiveness of correlation-based side-channel power analysis attacks. This increase in parallelism causes the correlation between the predicted and actual power consumption to decrease, thereby lowering the attack's success rate.",
"rag": "Parallel processing in neural network hardware accelerators can reduce the effectiveness of correlation-based side-channel power analysis attacks. This is because, when multiple neurons in the same fully-connected layer process the same input in parallel (such as through concurrent multiply-and-accumulate operations), the power consumption patterns from each operation overlap and mix together. This overlapping makes it more difficult for an attacker to correlate the observed power traces with the values being processed by any individual neuron, thereby decreasing the signal-to-noise ratio of the leaked information.\n\nAccording to the context, as the level of parallelism increases, the correlation between the observed power traces and specific operations performed by the hardware diminishes. The paper even derives equations that describe how this correlation drops with higher degrees of parallel processing, effectively reducing the success rate of correlation power analysis attacks. These findings were also validated experimentally using an FPGA implementation of a vector-multiplication unit, confirming that increased parallelism in hardware accelerators can offer a form of intrinsic resistance against this type of side-channel attack."
}
},
{
"task": {
"question": "How does CHDP address the challenges of learning and optimizing policies in hybrid discrete-continuous action spaces?",
"answer": "CHDP frames the hybrid action space problem as a fully cooperative game between two agents: one using a discrete diffusion policy and the other using a continuous diffusion policy. The continuous policy is conditioned on the discrete action representation to explicitly model their dependency. To avoid update conflicts, CHDP applies a sequential update scheme that promotes co-adaptation. For scalability in high-dimensional discrete spaces, it introduces a codebook that embeds discrete actions into a low-dimensional latent space, along with a Q-function-based guidance mechanism to align the embeddings with the discrete policy. This design improves expressiveness, scalability, and performance, outperforming prior methods by up to 19.3% in success rate.",
"file": "texts/CHDP: Cooperative Hybrid Diffusion Policies for Reinforcement Learning in Parameterized Action Space.txt"
},
"tool_calls": [
"read",
"read"
],
"llm_evaluations": {
"fs_explorer": {
"relevance": 10,
"correctness": 9,
"reason": "The proposed answer matches the ground truth\u2019s key points: cooperative game formulation with discrete and continuous diffusion policies, conditioning of the continuous policy on the discrete representation, sequential updates to avoid conflicts and encourage co-adaptation, and the codebook + Q-guidance for scalability and alignment. The only notable omission is the quantitative performance claim (e.g., up to 19.3% success-rate improvement), so it\u2019s highly correct but not perfectly complete."
},
"rag": {
"relevance": 10,
"correctness": 10,
"reason": "The provided answer matches the ground truth on all key points: cooperative two-agent framing, discrete/continuous diffusion policies, conditioning continuous on discrete representation, sequential updates to avoid conflicts, codebook-based low-dimensional embedding for scalable discrete actions, and Q-guidance to align embeddings. It omits the specific numeric improvement (up to 19.3%), but this is a minor detail; overall it is fully relevant and technically accurate."
}
},
"used_files": {
"fs_explorer": [],
"rag": "/Users/clee/Desktop/code-projects/fs-explorer/data/benchmark/arxiv-100-papers/texts/CHDP: Cooperative Hybrid Diffusion Policies for Reinforcement Learning in Parameterized Action Space.txt"
},
"has_error": {
"fs_explorer": false,
"rag": false
},
"time_taken": {
"fs_explorer": 11.654190063476562,
"rag": 11.9148268699646
},
"answers": {
"fs_explorer": "CHDP addresses the challenges of learning and optimizing policies in hybrid discrete-continuous action spaces by treating the problem as a fully cooperative game. It employs two cooperative agents\u2014one using a discrete diffusion policy and the other a continuous diffusion policy\u2014where the continuous policy is conditioned on the discrete action's representation to model dependencies. To address specific challenges, CHDP uses: 1) A sequential update scheme to mitigate conflicts during simultaneous updates and foster co-adaptation; 2) A codebook to embed high-dimensional discrete action spaces into a low-dimensional latent space for better scalability; and 3) A Q-function-based guidance mechanism to align the codebook's embeddings with the discrete policy's representation during training.",
"rag": "CHDP addresses the challenges of learning and optimizing policies in hybrid discrete-continuous action spaces through several key mechanisms:\n\n1. **Cooperative Agents**: It employs two cooperative agents\u2014one responsible for discrete actions and the other for continuous parameters. This separation allows each agent to specialize and leverage expressiveness appropriate to its subspace.\n\n2. **Dependency Modeling**: The continuous policy is conditioned on the representation of the selected discrete action, enabling the model to capture and explicitly represent dependencies between discrete and continuous choices.\n\n3. **Diffusion Policies**: Both policies utilize diffusion models, which are well-suited for capturing complex, multimodal distributions, increasing overall policy expressiveness.\n\n4. **Sequential Policy Updates**: By updating discrete and continuous policies sequentially rather than simultaneously, CHDP mitigates update conflicts and encourages co-adaptation, improving stability and learning efficiency.\n\n5. **Latent Codebook for Discrete Actions**: To improve scalability in high-dimensional discrete spaces, CHDP embeds discrete actions into a compact, low-dimensional latent space via a codebook, making learning more efficient and structured.\n\n6. **Q-Function-Based Guidance**: A Q-function aligns the codebook embeddings with the representations learned by the discrete policy, ensuring the latent space remains meaningful and useful during training.\n\nThese innovations collectively help CHDP overcome traditional limitations in expressiveness and scalability, resulting in superior performance in challenging hybrid action benchmarks."
}
},
{
"task": {
"question": "What impact do medical personas and interaction styles have on the safety and performance of large language models in clinical decision-making?",
"answer": "Medical personas act as behavioral priors that produce context-dependent and non-monotonic effects in clinical LLMs. They improve accuracy and calibration in critical care tasks by up to about 20%, but can significantly degrade performance in primary-care settings. Interaction styles influence risk-taking and sensitivity, though their effects vary by model. While automated evaluations tend to favor medical personas in safety-critical scenarios, human clinicians show only moderate agreement on safety compliance and report low confidence in reasoning quality, indicating that personas introduce trade-offs rather than guaranteed safety or expertise.",
"file": "texts/The Persona Paradox: Medical Personas as Behavioral Priors in Clinical Language Models.txt"
},
"tool_calls": [
"read",
"read"
],
"llm_evaluations": {
"fs_explorer": {
"relevance": 10,
"correctness": 9,
"reason": "The proposed answer closely matches the ground truth: it identifies personas/interaction styles as behavioral priors with context-dependent, non-monotonic effects; notes ~20% gains in accuracy/calibration in critical care and degradation in primary care; and describes interaction-style effects on risk-taking varying by model. It also captures the discrepancy between automated evaluations and clinician judgments, including low confidence in reasoning quality. Minor omission: it doesn\u2019t explicitly mention only \u201cmoderate agreement\u201d among clinicians on safety compliance, but the overall substance is accurate."
},
"rag": {
"relevance": 10,
"correctness": 10,
"reason": "The answer closely matches the ground truth: it describes personas as producing complex, context-dependent/non-monotonic effects; notes up to ~20% improvements in accuracy/calibration in critical care; highlights degraded performance in primary care; explains interaction-style effects on risk-taking/sensitivity varying by model; and contrasts automated evaluations favoring personas with only moderate clinician agreement and low confidence in reasoning quality. No substantive omissions or contradictions."
}
},
"used_files": {
"fs_explorer": [],
"rag": "/Users/clee/Desktop/code-projects/fs-explorer/data/benchmark/arxiv-100-papers/texts/The Persona Paradox: Medical Personas as Behavioral Priors in Clinical Language Models.txt"
},
"has_error": {
"fs_explorer": false,
"rag": false
},
"time_taken": {
"fs_explorer": 9.562525987625122,
"rag": 7.819545745849609
},
"answers": {
"fs_explorer": "Medical personas and interaction styles function as behavioral priors for LLMs in clinical decision-making, but they introduce context-dependent trade-offs rather than consistent improvements. Medical personas (e.g., ED physician, nurse) improve performance in critical care tasks, with accuracy and calibration gains of up to ~+20%, yet they degrade performance in primary-care settings by similar margins. Interaction styles (bold vs. cautious) modulate a model's risk propensity and sensitivity, though this effect varies significantly between different models. Overall, while LLM-based evaluations favor medical personas in safety-critical cases, human clinicians report low confidence in the reasoning quality provided, indicating that personas do not provide a monotonic guarantee of safety or expertise.",
"rag": "Medical personas and interaction styles have significant, but complex, impacts on the safety and performance of large language models (LLMs) in clinical decision-making. According to the provided context, assigning medical personas (such as Emergency Department physician or nurse) to LLMs generally improves their accuracy and calibration in safety-critical, high-stakes clinical tasks\u2014sometimes by as much as 20%. This suggests that personas can enhance the model's performance where clinical expertise is crucial, such as critical care scenarios.\n\nHowever, these benefits are not uniform or monotonic. In primary-care settings, medical personas can actually degrade LLM performance by comparable margins, indicating a context-dependent effect. Interaction styles (bold versus cautious communication) further influence the model's risk behavior and sensitivity, but the impact of these styles varies significantly depending on the specific LLM used.\n\nFinally, while automated LLM-judge rankings tend to prefer medical personas in safety-critical tasks, human clinical evaluators show only moderate agreement on whether the responses are safe, and they express low confidence in most assessments of reasoning quality. This demonstrates that persona conditioning introduces context-dependent trade-offs: while it can guide LLMs toward more expert-like and safer responses in some settings, it does not offer a universal guarantee of improved safety or expertise across all clinical scenarios."
}
},
{
"task": {
"question": "How effective is a Vision Transformer-based deep learning model for segmenting pancreatic tumors in endoscopic ultrasound images?",
"answer": "The Vision Transformer-based segmentation model demonstrates strong and consistent performance on public EUS datasets. Trained and validated on over 17,000 images, it achieved mean Dice similarity coefficients around 0.65, high specificity (over 97%), and accuracy above 97% in both cross-validation and external testing. While results indicate robust tumor segmentation capability, challenges such as dataset heterogeneity, occasional erroneous multiple predictions, and limited external validation suggest that further refinement and prospective evaluation are needed before clinical deployment.",
"file": "texts/Performance of a Deep Learning-Based Segmentation Model for Pancreatic Tumors on Public Endoscopic Ultrasound Datasets.txt"
},
"tool_calls": [
"read",
"glob",
"read"
],
"llm_evaluations": {
"fs_explorer": {
"relevance": 10,
"correctness": 9,
"reason": "The proposed answer closely matches the ground truth: it reports similar mean Dice (~0.65), high specificity (>97%), and accuracy (>97%) across cross-validation and external testing, and it notes key limitations like dataset heterogeneity and erroneous multiple predictions (~9.7%). It is highly relevant and largely correct. The only minor gap is that it doesn\u2019t explicitly mention the limited extent of external validation and the need for prospective evaluation before clinical deployment (though it implies refinement is needed)."
},
"rag": {
"relevance": 10,
"correctness": 9,
"reason": "The evaluated answer closely matches the ground truth: it reports strong performance (DSC ~0.65), very high specificity (>97%), high accuracy (~97%), mentions large-scale training/validation and external testing, and notes key limitations (heterogeneity, limited external validation, need for further refinement). Minor discrepancies/extra details (e.g., specific sensitivity/IoU numbers and 9.7% error rate not explicitly in the ground truth) prevent a perfect correctness score, but they do not contradict the ground truth."
}
},
"used_files": {
"fs_explorer": [],
"rag": "/Users/clee/Desktop/code-projects/fs-explorer/data/benchmark/arxiv-100-papers/texts/Performance of a Deep Learning-Based Segmentation Model for Pancreatic Tumors on Public Endoscopic Ultrasound Datasets.txt"
},
"has_error": {
"fs_explorer": false,
"rag": false
},
"time_taken": {
"fs_explorer": 12.818586826324463,
"rag": 9.44664978981018
},
"answers": {
"fs_explorer": "The Vision Transformer-based deep learning model (using the USFM framework) demonstrated strong performance for segmenting pancreatic tumors in endoscopic ultrasound images. In 5-fold cross-validation, it achieved a mean Dice similarity coefficient (DSC) of 0.651, a sensitivity of 69.8%, a specificity of 98.8%, and an accuracy of 97.5%. On an external validation set, it achieved a DSC of 0.657 and an IoU of 0.614. Despite these strong results, 9.7% of cases showed erroneous multiple predictions, and dataset heterogeneity remains a challenge for refinement.",
"rag": "A Vision Transformer-based deep learning model is quite effective for segmenting pancreatic tumors in endoscopic ultrasound (EUS) images, according to the provided study. The model, trained on a large dataset of EUS images and validated externally, achieved solid performance metrics: Dice similarity coefficients (DSC) of 0.651 in cross-validation and 0.657 in external testing, with intersection over union (IoU) values around 0.58 to 0.61. Sensitivity ranged from about 69.8% to 71.8%, and specificity was very high (over 97%). Accuracy was also high at 97.5% in cross-validation. However, the study notes some limitations, including minor errors in about 9.7% of cases and the need for further validation due to dataset heterogeneity and limited external testing. Overall, the Vision Transformer-based approach shows strong potential but requires more refinement and broader testing for reliable clinical deployment."
}
},
{
"task": {
"question": "How does the WildSci dataset advance large language model reasoning in scientific domains with limited structured data?",
"answer": "WildSci advances scientific reasoning by providing a large, automatically synthesized dataset of multiple-choice science questions derived from peer-reviewed literature across nine disciplines and 26 subdomains. By converting complex, open-ended scientific reasoning tasks into a multiple-choice format, it enables scalable training with clear reward signals. Reinforcement learning is then used to finetune models on this data, leading to improved domain-specific performance, better generalization, and more informative training dynamics on a range of scientific benchmarks.",
"file": "texts/WildSci: Advancing Scientific Reasoning from In-the-Wild Literature.txt"
},
"tool_calls": [
"read",
"glob",
"read"
],
"llm_evaluations": {
"fs_explorer": {
"relevance": 10,
"correctness": 9,
"reason": "The answer closely matches the ground truth: it mentions automatic synthesis from peer-reviewed literature, coverage across 9 disciplines/26 subdomains, conversion to multiple-choice for scalable training with clear reward signals, and use of reinforcement learning to improve performance. Minor omissions include explicitly noting improved generalization, more informative training dynamics, and improvements across a range of scientific benchmarks, but overall it is accurate and highly relevant."
},
"rag": {
"relevance": 10,
"correctness": 9,
"reason": "The answer closely matches the ground truth: it notes WildSci is automatically synthesized from peer-reviewed literature across 9 disciplines/26 subdomains, converts open-ended reasoning into multiple-choice for scalable training with clear reward signals, and enables reinforcement learning finetuning, leading to improved performance and generalization. It adds an extra point about benchmarking/evaluation (consistent with the ground truth\u2019s mention of scientific benchmarks and training dynamics). Minor deduction because it is slightly less explicit about \u201cmore informative training dynamics\u201d and \u201cimproved domain-specific performance\u201d as concrete outcomes, though these are implied."
}
},
"used_files": {
"fs_explorer": [],
"rag": "/Users/clee/Desktop/code-projects/fs-explorer/data/benchmark/arxiv-100-papers/texts/WildSci: Advancing Scientific Reasoning from In-the-Wild Literature.txt"
},
"has_error": {
"fs_explorer": false,
"rag": false
},
"time_taken": {
"fs_explorer": 13.33497428894043,
"rag": 10.957519054412842
},
"answers": {
"fs_explorer": "The WildSci dataset advances large language model (LLM) reasoning in scientific domains with limited structured data by automatically synthesizing domain-specific science questions directly from peer-reviewed literature. It covers 9 disciplines and 26 subdomains, addressing the lack of coverage and structured data in fields like medicine and materials science. By framing these complex reasoning tasks in a multiple-choice format, the dataset enables scalable training and reinforcement learning through the use of well-defined reward signals, allowing models to improve their performance on open-ended scientific questions.",
"rag": "The WildSci dataset advances large language model (LLM) reasoning in scientific domains with limited structured data through several key innovations based on the context provided:\n\n1. **Broader Coverage of Scientific Domains:** WildSci is synthesized from peer-reviewed literature and spans 9 scientific disciplines and 26 subdomains, addressing the limited dataset coverage that previously constrained LLM progress in areas like medicine and materials science.\n\n2. **Multiple-Choice Format for Scalability:** By framing complex, open-ended scientific questions in a multiple-choice format, WildSci makes it possible to train LLMs using well-defined reward signals and objective evaluation metrics, which previously were lacking in these fields.\n\n3. **Enabling Reinforcement Learning:** The structured format allows the application of reinforcement learning techniques (such as finetuning on these data), which have driven recent breakthroughs in reasoning within domains like math and coding, but were harder to apply in the absence of structured scientific datasets.\n\n4. **Improved Evaluation and Benchmarking:** WildSci's coverage and format permit the creation and use of robust scientific reasoning benchmarks, enabling better analysis of LLM performance, domain-specific behavior, and generalization capacity in scientific reasoning tasks.\n\nOverall, WildSci provides the structured data and scalable evaluation framework necessary for effective training and assessment of LLMs in complex scientific domains, thereby accelerating progress that was previously hindered by data scarcity and task complexity."
}
}
]
@@ -0,0 +1,19 @@
{
"time_stats": {
"fs_explorer": 11.80861964225769,
"rag": 9.905552530288697,
"best": "rag"
},
"llm_stats": {
"fs_explorer": {
"correctness": 9,
"relevance": 10
},
"rag": {
"correctness": 9.6,
"relevance": 10
},
"best_correctness": "rag",
"best_relevance": "rag"
}
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,27 @@
[
{
"question": "What is MEDS-OWL and what problem does it solve in clinical machine learning?",
"answer": "MEDS-OWL is a lightweight OWL ontology that bridges the Medical Event Data Standard (MEDS) with the Semantic Web ecosystem. It solves the problem that MEDS, while providing a standardized event-centric data model for machine learning workflows, lacks native integration with Semantic Web technologies. MEDS-OWL enables MEDS datasets to be represented as semantically explicit RDF graphs through formal concepts and relations, improving interoperability, reproducibility, and FAIR alignment of clinical data. The ontology works alongside meds2rdf, a Python library that performs the actual transformation of MEDS events into RDF graphs that conform to the ontology's specifications.",
"file": "texts/Clinical Data Goes MEDS? Let's OWL make sense of it.txt"
},
{
"question": "What is Enhanced-FQL(λ) and what are its key innovations for continuous control tasks?",
"answer": "Enhanced-FQL(λ) is a fuzzy reinforcement learning framework that integrates Fuzzified Eligibility Traces (FET) and Segmented Experience Replay (SER) into fuzzy Q-learning with a Fuzzified Bellman Equation (FBE) for continuous control tasks. Its key innovations are: (1) a fuzzified Bellman equation with eligibility traces that enables stable multi-step credit assignment, and (2) a memory-efficient segment-based experience replay mechanism that enhances sample efficiency. The framework uses an interpretable fuzzy rule base instead of complex neural networks, achieving superior sample efficiency and reduced variance compared to fuzzy TD and SARSA(λ) baselines, while maintaining lower computational complexity than deep RL methods like DDPG. It provides theoretical convergence guarantees and is particularly suited for safety-critical applications requiring transparency and computational efficiency.",
"file": "texts/Enhanced-FQL($λ$), an Efficient and Interpretable RL with novel Fuzzy Eligibility Traces and Segmented Experience Replay.txt"
},
{
"question": "What is the two-sort weighted modal logic introduced in the paper talking about possibilistic reasoning with fuzzy formal contexts and what are its key features for reasoning with fuzzy formal contexts?",
"answer": "The paper introduces a two-sort weighted modal logic for possibilistic reasoning with fuzzy formal contexts, featuring two types of weighted modal operators corresponding to classical necessity (□) and sufficiency (⊟) modalities. The logic's formulas are interpreted in fuzzy formal contexts based on possibility theory. Key features include: (1) a sound axiomatization with respect to all fuzzy context models, with both necessity and sufficiency fragments being individually complete; (2) the ability to represent generalized versions of three main FCA notions—formal concepts, object oriented concepts, and property oriented concepts—as their corresponding c-cut concepts in fuzzy formal contexts; and (3) extensibility to reasoning with multi-relational fuzzy contexts that allow Boolean combinations of different fuzzy relations. The logic provides expressive power for possibilistic reasoning while maintaining formal completeness guarantees.",
"file": "texts/A Modal Logic for Possibilistic Reasoning with Fuzzy Formal Contexts.txt"
},
{
"question": "What is JMedEthicBench and what key findings does it reveal about medical safety in Large Language Models?",
"answer": "JMedEthicBench is the first multi-turn conversational benchmark for evaluating medical safety of LLMs in Japanese healthcare, based on 67 guidelines from the Japan Medical Association and containing over 50,000 adversarial conversations generated using seven automatically discovered jailbreak strategies. Key findings from evaluating 27 models include: (1) commercial models maintain robust safety while medical-specialized models show increased vulnerability; (2) safety scores decline significantly across conversation turns (median dropping from 9.5 to 5.0, p < 0.001); and (3) cross-lingual evaluation reveals that medical model vulnerabilities persist across both Japanese and English versions, indicating inherent alignment limitations rather than language-specific factors. These results suggest that domain-specific fine-tuning may inadvertently weaken safety mechanisms and that multi-turn interactions represent a distinct threat surface requiring dedicated alignment strategies.",
"file": "texts/JMedEthicBench: A Multi-Turn Conversational Benchmark for Evaluating Medical Safety in Japanese Large Language Models.txt"
},
{
"question": "What is Unified Thinker and how does it address the reasoning-execution gap in image generation models?",
"answer": "Unified Thinker is a task-agnostic reasoning architecture for general image generation that addresses the reasoning-execution gap by decoupling a dedicated Thinker module from the image Generator. This modular design allows reasoning upgrades without retraining the entire generative model. The framework decomposes high-level intents into grounded, verifiable plans that directly steer the generative process, functioning as a unified planning core that can plug into diverse generators and workflows. It employs a two-stage training paradigm: first building a structured planning interface for the Thinker, then applying reinforcement learning to ground its policy in pixel-level feedback, encouraging plans that optimize visual correctness over textual plausibility. Extensive experiments on text-to-image generation and image editing demonstrate that Unified Thinker substantially improves both image reasoning and generation quality, helping to close the gap between open-source models and closed-source systems like Nano Banana.",
"file": "texts/Unified Thinker: A General Reasoning Modular Core for Image Generation.txt"
}
]
@@ -0,0 +1,60 @@
# Evaluation Results Report
## Summary
Total tasks evaluated: **5**
---
## Time Performance
Average execution time across all tasks:
| Framework | Average Time (seconds) | Status |
|-----------|------------------------|--------|
| Agentic File Search with FileSystem Tools (fs-explorer) | 33.00s | |
| **Traditional RAG** | 8.04s | **Faster** |
**Winner:** Traditional RAG (24.96s faster)
---
## LLM Evaluation Metrics
### Correctness Scores
| Framework | Average Score | Status |
|-----------|---------------|--------|
| Agentic File Search with FileSystem Tools (fs-explorer) | 9.20 | |
| **Traditional RAG** | 9.25 | **Higher** |
**Winner:** Traditional RAG (+0.05 points)
### Relevance Scores
| Framework | Average Score | Status |
|-----------|---------------|--------|
| Agentic File Search with FileSystem Tools (fs-explorer) | 10.00 | Tie |
| Traditional RAG | 10.00 | Tie |
**Winner:** Tie (both frameworks scored 10.00; +0.00 points)
---
## Overall Comparison
| Metric | FS-Explorer | RAG | Winner |
|--------|-------------|-----|--------|
| **Speed** | 33.00s | 8.04s | Traditional RAG |
| **Correctness** | 9.20 | 9.25 | Traditional RAG |
| **Relevance** | 10.00 | 10.00 | Tie |
---
## Key Takeaways
- **Fastest Framework:** Traditional RAG is 24.96s faster on average
- **Most Correct:** Traditional RAG produces more correct answers
- **Most Relevant:** Tie — both frameworks scored 10.00 on relevance
**Overall Winner:** RAG (2/3 metrics; relevance tied)
@@ -0,0 +1,217 @@
[
{
"task": {
"question": "What is MEDS-OWL and what problem does it solve in clinical machine learning?",
"answer": "MEDS-OWL is a lightweight OWL ontology that bridges the Medical Event Data Standard (MEDS) with the Semantic Web ecosystem. It solves the problem that MEDS, while providing a standardized event-centric data model for machine learning workflows, lacks native integration with Semantic Web technologies. MEDS-OWL enables MEDS datasets to be represented as semantically explicit RDF graphs through formal concepts and relations, improving interoperability, reproducibility, and FAIR alignment of clinical data. The ontology works alongside meds2rdf, a Python library that performs the actual transformation of MEDS events into RDF graphs that conform to the ontology's specifications.",
"file": "texts/Clinical Data Goes MEDS? Let's OWL make sense of it.txt"
},
"tool_calls": [
"read",
"glob",
"read"
],
"llm_evaluations": {
"fs_explorer": {
"relevance": 10,
"correctness": 9,
"reason": "The answer accurately defines MEDS-OWL as a lightweight OWL ontology bridging MEDS with the Semantic Web and correctly states the core problem it solves: MEDS lacks native Semantic Web/RDF integration, limiting interoperability. It also correctly mentions RDF graph representation and FAIR alignment. It slightly diverges from the ground truth by adding claims like 'provenance-aware publishing' and 'graph-based analytics' (plausible but not specified) and omits the explicit mention of meds2rdf as the companion transformation library, hence not a perfect score."
},
"rag": null
},
"used_files": {
"fs_explorer": [
"./metadata.jsonl",
"./texts/Clinical Data Goes MEDS? Let's OWL make sense of it.txt"
],
"rag": null
},
"has_error": {
"fs_explorer": false,
"rag": true
},
"time_taken": {
"fs_explorer": 18.111814260482788,
"rag": 3.3658759593963623
},
"answers": {
"fs_explorer": "MEDS-OWL is a lightweight OWL (Web Ontology Language) ontology designed to bridge the Medical Event Data Standard (MEDS) with the Semantic Web ecosystem. It addresses the problem of limited interoperability and lack of natively semantic representation in clinical machine learning by providing formal concepts and relations that enable representing MEDS datasets as RDF graphs. This allows for data transformation into FAIR-aligned datasets, provenance-aware publishing, and better integration with graph-based analytics.",
"rag": null
}
},
{
"task": {
"question": "What is Enhanced-FQL(\u03bb) and what are its key innovations for continuous control tasks?",
"answer": "Enhanced-FQL(\u03bb) is a fuzzy reinforcement learning framework that integrates Fuzzified Eligibility Traces (FET) and Segmented Experience Replay (SER) into fuzzy Q-learning with a Fuzzified Bellman Equation (FBE) for continuous control tasks. Its key innovations are: (1) a fuzzified Bellman equation with eligibility traces that enables stable multi-step credit assignment, and (2) a memory-efficient segment-based experience replay mechanism that enhances sample efficiency. The framework uses an interpretable fuzzy rule base instead of complex neural networks, achieving superior sample efficiency and reduced variance compared to fuzzy TD and SARSA(\u03bb) baselines, while maintaining lower computational complexity than deep RL methods like DDPG. It provides theoretical convergence guarantees and is particularly suited for safety-critical applications requiring transparency and computational efficiency.",
"file": "texts/Enhanced-FQL($\u03bb$), an Efficient and Interpretable RL with novel Fuzzy Eligibility Traces and Segmented Experience Replay.txt"
},
"tool_calls": [
"read",
"glob",
"glob",
"read"
],
"llm_evaluations": {
"fs_explorer": {
"relevance": 10,
"correctness": 9,
"reason": "The proposed answer accurately captures the definition of Enhanced-FQL(\u03bb) and its two main innovations (FBE+FET for multi-step credit assignment and SER for memory-efficient, sample-efficient replay) and correctly situates it in continuous control with an interpretable fuzzy rule base. It omits some secondary ground-truth details (e.g., explicit mention of integration into fuzzy Q-learning, comparative claims vs. fuzzy TD/SARSA(\u03bb) and DDPG, convergence guarantees, and safety-critical transparency emphasis), so it is slightly incomplete but still essentially correct and highly relevant."
},
"rag": {
"relevance": 10,
"correctness": 9,
"reason": "The answer captures the core definition and both key innovations (FET and SER), including interpretability via fuzzy rules, improved sample efficiency/reduced variance vs fuzzy TD/SARSA(\u03bb), and lower complexity than deep RL (e.g., DDPG) with suitability for transparency/safety-critical settings. Minor omissions/less explicit points vs the ground truth are the explicit mention of the Fuzzified Bellman Equation (FBE) as a named component and the note about theoretical convergence guarantees."
}
},
"used_files": {
"fs_explorer": [
"metadata.jsonl",
"./texts/Enhanced-FQL($\u03bb$), an Efficient and Interpretable RL with novel Fuzzy Eligibility Traces and Segmented Experience Replay.txt"
],
"rag": null
},
"has_error": {
"fs_explorer": false,
"rag": false
},
"time_taken": {
"fs_explorer": 61.18818807601929,
"rag": 6.881592750549316
},
"answers": {
"fs_explorer": "Enhanced-FQL(\u03bb) is a fuzzy reinforcement learning framework designed for continuous control tasks that utilizes an interpretable fuzzy rule base rather than complex neural architectures. Its key innovations include: 1) A Fuzzified Bellman Equation (FBE) integrated with novel Fuzzified Eligibility Traces (FET) for stable multi-step credit assignment, and 2) a Segmented Experience Replay (SER) mechanism that improves sample efficiency while remaining memory-efficient.",
"rag": "Enhanced-FQL(\u03bb) is a fuzzy reinforcement learning framework designed for continuous control tasks. Instead of using complex neural networks, it relies on an interpretable fuzzy rule base to guide decision-making. The key innovations of Enhanced-FQL(\u03bb) are:\n\n1. **Fuzzified Eligibility Traces (FET):** This technique enables more stable multi-step credit assignment in the learning process by incorporating a fuzzified version of eligibility traces into the Bellman equation. This helps the agent learn more efficiently from sequences of experiences.\n\n2. **Segmented Experience Replay (SER):** Enhanced-FQL(\u03bb) introduces a memory-efficient, segment-based mechanism for replaying past experiences. This leads to improved sample efficiency by allowing the agent to better reuse relevant past data for learning.\n\nTogether, these innovations provide Enhanced-FQL(\u03bb) with superior sample efficiency and reduced variance compared to earlier fuzzy reinforcement learning methods like n-step fuzzy TD and fuzzy SARSA(\u03bb). Additionally, the algorithm maintains substantially lower computational complexity than deep reinforcement learning approaches (such as DDPG), making it well-suited for applications where interpretability, efficiency, and transparency are critical."
}
},
{
"task": {
"question": "What is the two-sort weighted modal logic introduced in the pape talking about possibilistic reasoning with fuzzy formal contexts and what are its key features for reasoning with fuzzy formal contexts?",
"answer": "The paper introduces a two-sort weighted modal logic for possibilistic reasoning with fuzzy formal contexts, featuring two types of weighted modal operators corresponding to classical necessity (\u25a1) and sufficiency (\u229f) modalities. The logic's formulas are interpreted in fuzzy formal contexts based on possibility theory. Key features include: (1) a sound axiomatization with respect to all fuzzy context models, with both necessity and sufficiency fragments being individually complete; (2) the ability to represent generalized versions of three main FCA notions\u2014formal concepts, object oriented concepts, and property oriented concepts\u2014as their corresponding c-cut concepts in fuzzy formal contexts; and (3) extensibility to reasoning with multi-relational fuzzy contexts that allow Boolean combinations of different fuzzy relations. The logic provides expressive power for possibilistic reasoning while maintaining formal completeness guarantees.",
"file": "texts/A Modal Logic for Possibilistic Reasoning with Fuzzy Formal Contexts.txt"
},
"tool_calls": [
"read",
"glob",
"glob",
"read"
],
"llm_evaluations": {
"fs_explorer": {
"relevance": 10,
"correctness": 9,
"reason": "The answer closely matches the ground truth: it identifies the two-sort weighted modal logic for possibilistic reasoning in fuzzy formal contexts, mentions the two weighted modalities (necessity and sufficiency), sound axiomatization with fragment-wise completeness, representation of c-cut versions of key FCA notions, and extensibility to multi-relational contexts with Boolean combinations. Minor issues: the sufficiency symbol is rendered differently (\u22a0 vs \u229f in the ground truth), but the meaning is preserved; otherwise it is accurate and complete."
},
"rag": {
"relevance": 10,
"correctness": 9,
"reason": "The response closely matches the ground truth: it identifies the two-sort weighted modal logic for possibilistic reasoning in fuzzy formal contexts, correctly names the two weighted modalities (necessity \u25a1 and sufficiency \u229f/boxminus), mentions fuzzy-context/possibility-theory semantics, sound axiomatization with fragment-wise completeness, representation of FCA notions via c-cut concepts, and extensibility to multi-relational contexts with Boolean combinations. Minor issues: it doesn\u2019t explicitly emphasize \u201ctwo-sort\u201d as two domains (objects vs attributes) beyond \u201ctwo-sorted modality,\u201d and slightly overstates general completeness (it\u2019s fragment-wise, not necessarily full)."
}
},
"used_files": {
"fs_explorer": [
"./metadata.jsonl",
"./texts/A Modal Logic for Possibilistic Reasoning with Fuzzy Formal Contexts.txt"
],
"rag": null
},
"has_error": {
"fs_explorer": false,
"rag": false
},
"time_taken": {
"fs_explorer": 26.732245206832886,
"rag": 9.501526117324829
},
"answers": {
"fs_explorer": "The two-sort weighted modal logic introduced for possibilistic reasoning with fuzzy formal contexts is a logical framework designed to interpret formulas within fuzzy formal contexts based on possibility theory. \n\nIts key features for reasoning with fuzzy formal contexts include:\n- **Weighted Modal Operators**: The syntax incorporates two types of weighted modal operators corresponding to classical necessity (\u25a1) and sufficiency (\u22a0) modalities.\n- **Axiomatization and Completeness**: It features a sound axiomatization relative to all fuzzy context models, with both its necessity and sufficiency fragments being individually complete.\n- **Representation of FCA Notions**: The logic can represent generalized $c$-cut versions of three main notions in Formal Concept Analysis (FCA): formal concepts, object-oriented concepts, and property-oriented concepts.\n- **Extensibility**: It can be extended to handle multi-relational fuzzy contexts, allowing for Boolean combinations of different fuzzy relations.",
"rag": "The two-sort weighted modal logic described in the paper is a logical system designed specifically for reasoning about fuzzy formal contexts using tools from possibility theory. This logic extends classical modal logic by introducing two distinct types of weighted modal operators: one for necessity (denoted $\\Box$) and one for sufficiency (denoted $\\boxminus$). These operators are tailored to capture possibility-based reasoning in settings where information is not crisp but graded (fuzzy).\n\nKey features of this logic for reasoning with fuzzy formal contexts include:\n\n1. **Two-Sorted Modality:** It distinguishes between two types of modal necessity: necessity ($\\Box$) and sufficiency ($\\boxminus$), reflecting different ways we can assess the strength or relevance of information in a fuzzy context.\n2. **Interpretation in Fuzzy Contexts:** Formulas in this logic are interpreted over fuzzy formal contexts, where both objects and attributes (properties) may have degrees of association, and possibility theory provides the underlying semantics.\n3. **Sound and (Fragment-wise) Complete Axiomatization:** The logic's axioms are sound for all fuzzy context models, and each modal fragment (necessity or sufficiency) is complete\u2014meaning all semantically valid statements in the fragment are derivable\u2014relative to the fuzzy context framework.\n4. **Expressive Power:** The logic can represent various types of 'concepts' central to formal concept analysis (FCA)\u2014such as formal concepts, object-oriented concepts, and property-oriented concepts\u2014generalized to fuzzy settings as $c$-cut concepts.\n5. 
**Generalization to Multi-relational Contexts:** The logic can be extended to handle multiple fuzzy relations and their Boolean combinations, supporting more complex relational reasoning scenarios.\n\nIn summary, this logic provides a formal, possibility-theoretic language for expressing and reasoning about nuanced, graded relationships in fuzzy formal contexts, making it a significant tool for fields like data analysis, knowledge representation, and AI where information is often uncertain or imprecise."
}
},
{
"task": {
"question": "What is JMedEthicBench and what key findings does it reveal about medical safety in Large Language Models?",
"answer": "JMedEthicBench is the first multi-turn conversational benchmark for evaluating medical safety of LLMs in Japanese healthcare, based on 67 guidelines from the Japan Medical Association and containing over 50,000 adversarial conversations generated using seven automatically discovered jailbreak strategies. Key findings from evaluating 27 models include: (1) commercial models maintain robust safety while medical-specialized models show increased vulnerability; (2) safety scores decline significantly across conversation turns (median dropping from 9.5 to 5.0, p < 0.001); and (3) cross-lingual evaluation reveals that medical model vulnerabilities persist across both Japanese and English versions, indicating inherent alignment limitations rather than language-specific factors. These results suggest that domain-specific fine-tuning may inadvertently weaken safety mechanisms and that multi-turn interactions represent a distinct threat surface requiring dedicated alignment strategies.",
"file": "texts/JMedEthicBench: A Multi-Turn Conversational Benchmark for Evaluating Medical Safety in Japanese Large Language Models.txt"
},
"tool_calls": [
"read",
"glob",
"glob",
"glob",
"read"
],
"llm_evaluations": {
"fs_explorer": {
"relevance": 10,
"correctness": 10,
"reason": "The answer accurately defines JMedEthicBench (Japanese, multi-turn, 67 JMA guidelines, 50k+ adversarial conversations, seven jailbreak strategies) and correctly summarizes the key findings (commercial vs medical-specialized robustness, significant multi-turn safety degradation with the stated medians, cross-lingual persistence implying inherent alignment limits, and fine-tuning/multi-turn threat-surface implications). It closely matches the ground truth with no substantive omissions or errors."
},
"rag": {
"relevance": 10,
"correctness": 10,
"reason": "The answer matches the ground truth on all core points: it defines JMedEthicBench as the first multi-turn Japanese medical safety benchmark, cites the 67 JMA guidelines, >50k adversarial conversations, and seven jailbreak strategies. It accurately summarizes the key evaluation findings across 27 models: commercial robustness vs medical-model vulnerability, significant safety degradation over turns (median 9.5\u21925.0), cross-lingual persistence of vulnerabilities, and the implication that domain fine-tuning can weaken safety and that multi-turn is a distinct threat surface. Minor differences are only in phrasing and adding a clearly consistent implication."
}
},
"used_files": {
"fs_explorer": [
"./metadata.jsonl",
"./texts/JMedEthicBench: A Multi-Turn Conversational Benchmark for Evaluating Medical Safety in Japanese Large Language Models.txt"
],
"rag": null
},
"has_error": {
"fs_explorer": false,
"rag": false
},
"time_taken": {
"fs_explorer": 32.922569036483765,
"rag": 6.84380578994751
},
"answers": {
"fs_explorer": "JMedEthicBench is the first multi-turn conversational benchmark designed to evaluate the medical safety of Large Language Models (LLMs) in the context of Japanese healthcare. It is based on 67 guidelines from the Japan Medical Association and includes over 50,000 adversarial conversations generated via seven jailbreak strategies.\n\nKey findings revealed by the benchmark include:\n1. Model Vulnerability: While commercial models generally maintain robust safety, medical-specialized models exhibit increased vulnerability.\n2. Safety Decay in Conversations: Safety scores decline significantly as conversation turns progress (median scores dropping from 9.5 to 5.0).\n3. Cross-lingual Persistence: Vulnerabilities in medical models persist across both Japanese and English versions of the benchmark, suggesting inherent alignment limitations rather than language-specific issues.\n4. Impact of Fine-tuning: Domain-specific fine-tuning may unintentionally weaken safety mechanisms, and multi-turn interactions represent a distinct threat surface that current alignment strategies may not adequately address.",
"rag": "JMedEthicBench is a multi-turn conversational benchmark specifically designed to evaluate the medical safety of Large Language Models (LLMs) in the context of Japanese healthcare. It is notable for being the first benchmark of its kind focused on multi-turn (i.e., ongoing conversation) interactions, as opposed to single-turn prompts, and for being based on real Japanese clinical guidelines (67 from the Japan Medical Association). The dataset contains over 50,000 adversarial conversations generated using seven different jailbreak strategies.\n\nKey findings from evaluations using JMedEthicBench are:\n1. Commercial LLMs generally maintain robust safety when responding to adversarial or potentially unsafe medical queries.\n2. Medical-specialized LLMs, those fine-tuned specific to healthcare tasks, show increased vulnerability and are more likely to violate medical safety guidelines, especially in adversarial scenarios.\n3. Safety scores for all models drop significantly over the course of multi-turn conversations (median falling from 9.5 to 5.0), highlighting that ongoing dialogues pose a greater risk for unsafe responses.\n4. Cross-lingual testing in both Japanese and English shows that these vulnerabilities are not just language-specific, indicating fundamental alignment limitations in current LLMs.\n5. Domain-specific fine-tuning can unintentionally weaken the safety mechanisms of LLMs, and multi-turn interactions should be addressed as a distinct and critical threat surface for model alignment."
}
},
{
"task": {
"question": "What is Unified Thinker and how does it address the reasoning-execution gap in image generation models?",
"answer": "Unified Thinker is a task-agnostic reasoning architecture for general image generation that addresses the reasoning-execution gap by decoupling a dedicated Thinker module from the image Generator. This modular design allows reasoning upgrades without retraining the entire generative model. The framework decomposes high-level intents into grounded, verifiable plans that directly steer the generative process, functioning as a unified planning core that can plug into diverse generators and workflows. It employs a two-stage training paradigm: first building a structured planning interface for the Thinker, then applying reinforcement learning to ground its policy in pixel-level feedback, encouraging plans that optimize visual correctness over textual plausibility. Extensive experiments on text-to-image generation and image editing demonstrate that Unified Thinker substantially improves both image reasoning and generation quality, helping to close the gap between open-source models and closed-source systems like Nano Banana.",
"file": "texts/Unified Thinker: A General Reasoning Modular Core for Image Generation.txt"
},
"tool_calls": [
"read",
"glob",
"read"
],
"llm_evaluations": {
"fs_explorer": {
"relevance": 10,
"correctness": 9,
"reason": "The answer closely matches the ground truth: it describes Unified Thinker as a task-agnostic, modular reasoning/planning core decoupled from the generator; explains decomposing intents into grounded, verifiable plans to bridge the reasoning-execution gap; and mentions the two-stage training with RL using pixel-level feedback to optimize visual correctness over textual plausibility. The only notable omission is the explicit mention of experimental results/closing the gap vs. closed-source systems (e.g., Nano Banana), which is why it\u2019s not a perfect 10 for correctness."
},
"rag": {
"relevance": 10,
"correctness": 9,
"reason": "The answer closely matches the ground truth: it identifies Unified Thinker as a task-agnostic, modular reasoning architecture that decouples a Thinker from a Generator to bridge the reasoning\u2013execution gap; it describes intent decomposition into grounded/verifiable plans; it notes the two-stage training with RL using pixel-level feedback; and it cites improvements on text-to-image and editing, including closing the gap with closed-source systems. Minor omissions include not explicitly calling it a 'unified planning core' that can plug into diverse generators/workflows and not mentioning the specific comparison name 'Nano Banana,' but these are small details."
}
},
"used_files": {
"fs_explorer": [
"metadata.jsonl",
"./texts/Unified Thinker: A General Reasoning Modular Core for Image Generation.txt"
],
"rag": null
},
"has_error": {
"fs_explorer": false,
"rag": false
},
"time_taken": {
"fs_explorer": 26.033082962036133,
"rag": 13.592315912246704
},
"answers": {
"fs_explorer": "Unified Thinker is a task-agnostic reasoning architecture designed as a unified planning core that can be integrated into various image generators and workflows. It addresses the reasoning-execution gap in image generation models by employing 'executable reasoning,' which involves decomposing high-level user intents into grounded, verifiable plans that directly guide the generation process. Key aspects of this approach include decoupling the dedicated 'Thinker' from the image generator to allow for modular upgrades, and utilizing a two-stage training paradigm that first establishes a structured planning interface and then uses reinforcement learning with pixel-level feedback to ensure plans optimize for actual visual correctness rather than just textual plausibility.",
"rag": "Unified Thinker is a task-agnostic reasoning architecture designed for general image generation. Its key contribution is to address the reasoning\u2013execution gap that persists in current generative models, which often struggle with logic-intensive instruction following. Unlike traditional generative models that directly map instructions to images, Unified Thinker introduces a modular approach by separating the reasoning (\"Thinker\") from the generative (\"Generator\") components.\n\nThe Thinker is capable of decomposing high-level user intents into grounded, executable plans, which can be verified and used to guide the generation process more effectively. This modularity allows upgrades and improvements to the reasoning component without the need to retrain the entire image generator.\n\nTraining is carried out in two stages: first, a planning interface for the Thinker is built; then reinforcement learning is applied to align the Thinker\u2019s policies with pixel-level feedback, promoting plans that produce visually correct images (as opposed to just plausible images from a textual standpoint). Experiments demonstrate that this architecture leads to substantial improvements in reasoning-driven image generation and editing, helping to close the gap compared to high-performing closed-source systems."
}
}
]
@@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Run the advanced evaluation over question/answer batches 2..5, one batch per
# iteration, writing each batch's results to its own results_<n>.json file.

FIRST_BATCH=2
LAST_BATCH=5
# Pause between batches (presumably to stay under provider rate limits).
PAUSE_SECONDS=60

for ((i = FIRST_BATCH; i <= LAST_BATCH; i++))
do
    echo "Processing ${i}/${LAST_BATCH}"
    run-eval -df "question_and_answers_${i}.json" --advanced --output "results_${i}.json"
    # Only pause when another batch follows; sleeping after the last batch
    # would just delay the script's exit.
    if ((i < LAST_BATCH)); then
        echo "Sleeping..."
        sleep "${PAUSE_SECONDS}"
    fi
done
@@ -0,0 +1,19 @@
{
"time_stats": {
"fs_explorer": 32.99757990837097,
"rag": 8.037023305892944,
"best": "rag"
},
"llm_stats": {
"fs_explorer": {
"correctness": 9.2,
"relevance": 10
},
"rag": {
"correctness": 9.25,
"relevance": 10
},
"best_correctness": "rag",
"best_relevance": "rag"
}
}
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Download the 100 most recent cs.AI paper summaries from arXiv into
# arxiv-100-papers/, installing arxiv-cli first when it is not available.

TARGET_DIR="arxiv-100-papers"

# Make sure arxiv-cli is available, preferring cargo and falling back to npm.
if command -v arxiv-cli &> /dev/null; then
    echo "arxiv-cli found, proceeding..."
else
    echo "arxiv-cli not found, installing..."
    if command -v cargo &> /dev/null; then
        echo "Installing with cargo..."
        cargo install arxiv-cli
    else
        echo "Cargo not found, attempting with npm..."
        if command -v npm &> /dev/null; then
            echo "Installing with npm..."
            # Install globally: a project-local install would not put the
            # arxiv-cli executable on PATH for the invocation below.
            npm install -g @cle-does-things/arxiv-cli
        else
            echo "NPM not found, cannot install arxiv-cli"
            exit 1
        fi
    fi
fi

# Remove texts left over from a previous run of THIS benchmark size only.
# (The directory used to be hard-coded to arxiv-1000-papers/texts, which
# deleted the other benchmark's data and never cleaned this one.)
if [ -d "${TARGET_DIR}/texts" ]; then
    rm -rf "${TARGET_DIR}/texts"
fi

mkdir -p "${TARGET_DIR}/"
# Abort rather than download into the wrong directory if cd fails.
cd "${TARGET_DIR}/" || exit 1
echo "Download 100 most recent AI papers"
arxiv-cli --category cs.AI --limit 100 --summary
echo "Done!"
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Download the 1000 most recent cs.AI paper summaries from arXiv into
# arxiv-1000-papers/, installing arxiv-cli first when it is not available.

TARGET_DIR="arxiv-1000-papers"

# Make sure arxiv-cli is available, preferring cargo and falling back to npm.
if command -v arxiv-cli &> /dev/null; then
    echo "arxiv-cli found, proceeding..."
else
    echo "arxiv-cli not found, installing..."
    if command -v cargo &> /dev/null; then
        echo "Installing with cargo..."
        cargo install arxiv-cli
    else
        echo "Cargo not found, attempting with npm..."
        if command -v npm &> /dev/null; then
            echo "Installing with npm..."
            # Install globally: a project-local install would not put the
            # arxiv-cli executable on PATH for the invocation below.
            npm install -g @cle-does-things/arxiv-cli
        else
            echo "NPM not found, cannot install arxiv-cli"
            exit 1
        fi
    fi
fi

# Remove texts left over from a previous run so the download starts clean.
if [ -d "${TARGET_DIR}/texts" ]; then
    rm -rf "${TARGET_DIR}/texts"
fi

mkdir -p "${TARGET_DIR}/"
# Abort rather than download into the wrong directory if cd fails.
cd "${TARGET_DIR}/" || exit 1
echo "Download 1000 most recent AI papers"
arxiv-cli --category cs.AI --limit 1000 --summary
echo "Done!"
+3
View File
@@ -0,0 +1,3 @@
# cache-arxiv
Utility package that caches arXiv paper summaries so they can be connected more easily to downstream processing pipelines.
+21
View File
@@ -0,0 +1,21 @@
[build-system]
requires = ["uv_build>=0.9.10,<0.10.0"]
build-backend = "uv_build"
[project]
name = "cache-arxiv"
version = "0.1.0"
description = "Utility to cache arXiv paper summaries for downstream processing pipelines"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"diskcache>=5.6.3",
"typer>=0.20.0",
]
[tool.uv.build-backend]
module-name = "cache_arxiv"
[project.scripts]
cache-arxiv = "cache_arxiv.main:app"
@@ -0,0 +1,36 @@
from typer import Typer, Option
from typing import Annotated
from .utils import cache_texts
app = Typer()
# CLI entry point for `cache-arxiv`. Every option is optional; a value left
# unset is forwarded as None so that `cache_texts` applies its own default.
# NOTE: documented with comments rather than a docstring on purpose, because
# Typer would surface a docstring as the command's help text.
@app.command()
def main(
    directory: Annotated[
        str | None,
        Option(
            "-d",
            "--directory",
            help="Base directory where the arXiv papers and metadata are stored. Defaults to the current working directory.",
        ),
    ] = None,
    cache_directory: Annotated[
        str | None,
        Option(
            "-c",
            "--cache-dir",
            help="Sub-directory where to persistently cache arXiv paper texts. Defaults to `tmp/cache`.",
        ),
    ] = None,
    texts_directory: Annotated[
        str | None,
        Option(
            "-t",
            "--texts-dir",
            help="Sub-directory where the texts from the arXiv papers summaries are stored. Defaults to `texts`.",
        ),
    ] = None,
) -> None:
    # Pass by keyword so each CLI option is visibly matched to its parameter.
    cache_texts(
        base_dir=directory,
        cache_directory=cache_directory,
        texts_directory=texts_directory,
    )
@@ -0,0 +1,23 @@
from diskcache import Cache
from pathlib import Path
BASE_DIRECTORY = "."
CACHE_DIRECTORY = "tmp/cache"
TEXTS_DIRECTORY = "texts"
def cache_texts(
    base_dir: str | None = None,
    cache_directory: str | None = None,
    texts_directory: str | None = None,
) -> None:
    """Store the text of every file under the texts directory in a disk cache.

    Each file found recursively under ``<base_dir>/<texts_directory>`` is read
    as UTF-8 and added to a ``diskcache.Cache`` under its resolved absolute
    path. Entries are written with ``Cache.add``, so a key that is already
    cached keeps its existing value.

    Args:
        base_dir: Directory that contains the texts directory. Falls back to
            ``BASE_DIRECTORY`` when ``None`` or empty.
        cache_directory: Location of the disk cache, passed to ``Cache`` as
            given (it is not joined with ``base_dir``). Falls back to
            ``CACHE_DIRECTORY`` when ``None`` or empty.
        texts_directory: Name of the directory under ``base_dir`` holding the
            text files. Falls back to ``TEXTS_DIRECTORY`` when ``None`` or
            empty.
    """
    files_dir = Path(base_dir or BASE_DIRECTORY) / (
        texts_directory or TEXTS_DIRECTORY
    )
    # Use the cache as a context manager so its handle is closed even if
    # reading one of the files fails.
    with Cache(directory=cache_directory or CACHE_DIRECTORY) as cache:
        # rglob instead of Path.walk(): walk() only exists on Python >= 3.12,
        # while this package declares support for Python >= 3.10.
        for path in files_dir.rglob("*"):
            if not path.is_file():
                continue
            # Explicit encoding: the platform default may not be UTF-8, and
            # paper summaries routinely contain non-ASCII characters.
            text = path.read_text(encoding="utf-8")
            cache.add(str(path.resolve()), text)
+57 -1
View File
@@ -2,7 +2,7 @@
Framework to evaluate the performance of the fs-explorer agent (agentic file search with filesystem tools) against the performance of traditional RAG.
## Run the evaluation
## Run the evaluation (small scale)
> This assumes that you have `fs-explorer` and `rag-starterkit` installed
@@ -34,4 +34,60 @@ This will produce a [`results.json`](../../data/benchmark/results.json) file (co
```bash
get-stats
```
## Run the evaluation (at scale)
The evaluation can be brought to higher scales by using 100 or 1000 text-based abstracts from the most recent AI-related papers on arXiv.
In order to get the data:
- Go to the benchmark directory:
```bash
cd benchmarks
```
- Run the scripts to collect 100 and 1000 arXiv papers:
```bash
bash scripts/download_arxiv_100.sh
bash scripts/download_arxiv_1000.sh
```
- The previous step will create a `texts/` directory under both `arxiv-100-papers` and `arxiv-1000-papers`, as well as a `metadata.jsonl` file. Before running the evaluation on the existing `question_and_answers.json` file, we suggest you check for the existence of the files mentioned in the Q&A pairs, and, if they don't exist, you should create a new set of Q&A.
- Cache the papers in both the directories:
```bash
# install the cache-arxiv package first, under the packages/ directory
cd arxiv-100-papers
cache-arxiv
cd ../arxiv-1000-papers
cache-arxiv
cd ..
```
- Now run the evaluation on `arxiv-100-papers`:
```bash
cd arxiv-100-papers
run-eval -df question_and_answers.json --advanced
get-stats
cd ..
```
- Once the evaluation is done and you have collected the statistics, remove the `rag-benchmark-advanced` collection from Qdrant (we will be re-using it for the next experiment):
```bash
curl -X DELETE \
'http://localhost:6333/collections/rag-benchmark-advanced'
```
- Then, head to `arxiv-1000-papers`, and run the evaluation (you might need to split the tasks into sub-tasks because of rate limiting):
```bash
cd arxiv-1000-papers
run-eval -df question_and_answers.json --advanced
get-stats
```
@@ -112,14 +112,18 @@ def get_evaluation_dataset(dataset_file: str) -> list[EvalTask]:
return eval_tasks
async def run_evaluation(dataset_file: str, results_file: str = "results.json") -> None:
async def run_evaluation(
dataset_file: str, results_file: str = "results.json", advanced: bool = False
) -> None:
tasks = get_evaluation_dataset(dataset_file)
results: list[EvalResult] = []
try:
for i, task in enumerate(tasks):
print(f"Starting task {i + 1} of {len(tasks)}")
wf_result = await run_workflow(question=task["question"])
pipeline_result = await run_pipeline(question=task["question"])
wf_result = await run_workflow(question=task["question"], advanced=advanced)
pipeline_result = await run_pipeline(
question=task["question"], advanced=advanced
)
best_time = BestTime(
fs_explorer=wf_result["time_taken"], rag=pipeline_result["time_taken"]
)
@@ -19,6 +19,13 @@ def run_evaluations(
help="JSON file containing the questions and answers dataset",
),
],
advanced: Annotated[
bool,
Option(
"--advanced/--no-advanced",
help="Whether or not to run the evaluation in advanced mode.",
),
] = False,
results_file: Annotated[
str,
Option(
@@ -28,7 +35,11 @@ def run_evaluations(
),
] = "results.json",
) -> None:
asyncio.run(run_evaluation(dataset_file=dataset_file, results_file=results_file))
asyncio.run(
run_evaluation(
dataset_file=dataset_file, results_file=results_file, advanced=advanced
)
)
@app_stats.command()
@@ -15,6 +15,13 @@ FS_EXPLORER_PROMPT = Template(
"Search the answer to the following question: '{{question}}' by using one of the PDF files available in the current directory. In your final response, you must report the answer to the question. In this task, you MUST NOT ask for any human assistance and you MUST ONLY use tool calling."
)
FS_EXPLORER_PROMPT_ADVANCED = Template(
"Search the answer to the following question: '{{question}}' by using one of the text files available in the `./texts` directory. In order to understand what file you should be using, please consult the `metadata.jsonl` file. In your final response, you must report the answer to the question. In this task, you MUST NOT ask for any human assistance and you MUST ONLY use tool calling."
)
QDRANT_COLLECTION = "rag-benchmark"
QDRANT_COLLECTION_ADVANCED = "rag-benchmark-advanced"
class RunResult(TypedDict):
time_taken: float
@@ -24,8 +31,13 @@ class RunResult(TypedDict):
file_path: str | list[str] | None
async def run_workflow(question: str) -> RunResult:
start_event = InputEvent(task=FS_EXPLORER_PROMPT.render({"question": question}))
async def run_workflow(question: str, advanced: bool = False) -> RunResult:
if not advanced:
start_event = InputEvent(task=FS_EXPLORER_PROMPT.render({"question": question}))
else:
start_event = InputEvent(
task=FS_EXPLORER_PROMPT_ADVANCED.render({"question": question})
)
tool_calls = []
file_names: list[str] = []
start_time = time.time()
@@ -37,6 +49,14 @@ async def run_workflow(question: str) -> RunResult:
file_name = event.tool_input.get("file_path")
if file_name is not None:
file_names.append(file_name)
if event.tool_name == "read":
file_name = event.tool_input.get("file_path")
if file_name is not None:
file_names.append(file_name)
if event.tool_name == "grep":
file_name = event.tool_input.get("file_path")
if file_name is not None:
file_names.append(file_name)
result = await handler
end_time = time.time()
assert isinstance(result, ExplorationEndEvent)
@@ -51,12 +71,16 @@ async def run_workflow(question: str) -> RunResult:
PIPELINE = Pipeline(
qdrant_client=AsyncQdrantClient(location="http://localhost:6333"),
qdrant_collection_name="rag-benchmark",
qdrant_collection_name=QDRANT_COLLECTION,
cache_directory="tmp/cache",
)
async def run_pipeline(question: str) -> RunResult:
async def run_pipeline(question: str, advanced: bool = False) -> RunResult:
if advanced:
PIPELINE.vector_db.collection_name = QDRANT_COLLECTION_ADVANCED
PIPELINE.vector_db.sparse_only = True
PIPELINE.sparse_only = True
await PIPELINE.prepare()
start_time = time.time()
try:
@@ -16,6 +16,7 @@ class Pipeline:
qdrant_client: AsyncQdrantClient,
qdrant_collection_name: str,
rrf_constant: int = 60,
sparse_embeddings_only: bool = False,
parsing_kwargs: dict[str, Any] | None = None,
cache_directory: str | None = None,
openai_api_key: str | None = None,
@@ -52,9 +53,11 @@ class Pipeline:
collection_name=qdrant_collection_name,
embedder=self.embedder,
rrf_constant=rrf_constant,
sparse_only=sparse_embeddings_only,
)
self.filter_llm = LLMFilter(api_key=openai_api_key, model=openai_llm_model)
self.file_paths: list[str] = []
self.sparse_only = sparse_embeddings_only
self.is_ready = False
async def prepare(self) -> None:
@@ -67,7 +70,8 @@ class Pipeline:
contents = cast(dict[str, str], contents)
self.file_paths = [key for key in contents]
chunks = self.chunker.chunk_texts(contents)
chunks = await self.embedder.embed_chunks(chunks)
if not self.sparse_only:
chunks = await self.embedder.embed_chunks(chunks)
chunks = self.embedder.sparse_embed_chunks(chunks)
await self.vector_db.configure_collection()
await self.vector_db.upload(chunks)
@@ -66,6 +66,8 @@ class SimpleReranker:
sparse_results: list[SearchResult],
limit: int = 1,
) -> list[SearchResult]:
if len(dense_results) == 0:
return sparse_results[:limit]
rrf_scores = self._reciprocal_rank_fusion(dense_results, sparse_results)
results_map = self._dedupe_and_merge(dense_results, sparse_results)
reranked_results: list[SearchResult] = []
@@ -84,27 +86,37 @@ class VectorDB:
collection_name: str,
embedder: Embedder,
rrf_constant: int = 60,
sparse_only: bool = False,
) -> None:
self._client = qdrant_client
self.collection_name = collection_name
self.embedder = embedder
self.sparse_only = sparse_only
self._reranker = SimpleReranker(k=rrf_constant)
async def configure_collection(self) -> None:
if await self._client.collection_exists(self.collection_name):
return None
else:
await self._client.create_collection(
collection_name=self.collection_name,
vectors_config={
"dense-text": VectorParams(size=768, distance=Distance.COSINE)
},
sparse_vectors_config={
"sparse-text": SparseVectorParams(
index=SparseIndexParams(on_disk=False)
)
},
)
vectors_config = {
"dense-text": VectorParams(size=768, distance=Distance.COSINE)
}
sparse_vectors_config = {
"sparse-text": SparseVectorParams(
index=SparseIndexParams(on_disk=False)
)
}
if not self.sparse_only:
await self._client.create_collection(
collection_name=self.collection_name,
vectors_config=vectors_config,
sparse_vectors_config=sparse_vectors_config,
)
else:
await self._client.create_collection(
collection_name=self.collection_name,
sparse_vectors_config=sparse_vectors_config,
)
async def check_if_loaded(self) -> bool:
if not await self._client.collection_exists(self.collection_name):
@@ -124,17 +136,19 @@ class VectorDB:
values=d["sparse_embedding"].values.tolist(),
)
}
dense_embedding = {"dense-text": d["embedding"]}
payload = {"content": d["chunk"].text, "file_path": d["file_path"]}
sparse_embeddings.append(sparse_embedding)
dense_embeddings.append(dense_embedding)
if not self.sparse_only:
dense_embedding = {"dense-text": d["embedding"]}
dense_embeddings.append(dense_embedding)
payload = {"content": d["chunk"].text, "file_path": d["file_path"]}
payloads.append(payload)
self._client.upload_collection(
self.collection_name,
vectors=dense_embeddings,
payload=payloads,
ids=range(len(dense_embeddings)),
)
if not self.sparse_only:
self._client.upload_collection(
self.collection_name,
vectors=dense_embeddings,
payload=payloads,
ids=range(len(dense_embeddings)),
)
self._client.upload_collection(
self.collection_name,
vectors=sparse_embeddings,
@@ -147,20 +161,33 @@ class VectorDB:
async def search(
self, query: str, file_path: str | None = None, limit: int = 1
) -> list[SearchResult]:
dense_embedding = await self.embedder.embed_query(query)
sparse_embedding = self.embedder.sparse_embed_query(query)
dense_results: list[SearchResult] = []
sparse_results: list[SearchResult] = []
if file_path:
filt = Filter(
must=FieldCondition(key="file_path", match=MatchValue(value=file_path))
)
else:
filt = None
result_dense = await self._client.query_points(
collection_name=self.collection_name,
query=dense_embedding,
using="dense-text",
query_filter=filt,
)
if not self.sparse_only:
dense_embedding = await self.embedder.embed_query(query)
result_dense = await self._client.query_points(
collection_name=self.collection_name,
query=dense_embedding,
using="dense-text",
query_filter=filt,
)
for point in result_dense.points:
if point.payload is not None:
result = SearchResult(
id=cast(int, point.id),
content=point.payload.get("content", ""),
file_path=point.payload.get("file_path", ""),
score=point.score,
type="dense",
)
dense_results.append(result)
sparse_embedding = self.embedder.sparse_embed_query(query)
result_sparse = await self._client.query_points(
collection_name=self.collection_name,
query=SparseVector(
@@ -170,18 +197,6 @@ class VectorDB:
using="sparse-text",
query_filter=filt,
)
dense_results: list[SearchResult] = []
sparse_results: list[SearchResult] = []
for point in result_dense.points:
if point.payload is not None:
result = SearchResult(
id=cast(int, point.id),
content=point.payload.get("content", ""),
file_path=point.payload.get("file_path", ""),
score=point.score,
type="dense",
)
dense_results.append(result)
for point in result_sparse.points:
if point.payload is not None:
result = SearchResult(
+1
View File
@@ -23,6 +23,7 @@ module-name = "fs_explorer"
members = [
"packages/rag-starterkit",
"packages/eval-framework",
"packages/cache-arxiv",
]
[tool.uv.sources]
Generated
+16
View File
@@ -8,6 +8,7 @@ resolution-markers = [
[manifest]
members = [
"cache-arxiv",
"eval-framework",
"fs-explorer",
"rag-starterkit",
@@ -157,6 +158,21 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b4/d6/f9168956276934162ec8d48232f9920f2985ee45aa7602e3c6b4bc203613/banks-2.2.0-py3-none-any.whl", hash = "sha256:963cd5c85a587b122abde4f4064078def35c50c688c1b9d36f43c92503854e7d", size = 29244, upload-time = "2025-07-18T16:28:27.835Z" },
]
[[package]]
name = "cache-arxiv"
version = "0.1.0"
source = { editable = "packages/cache-arxiv" }
dependencies = [
{ name = "diskcache" },
{ name = "typer" },
]
[package.metadata]
requires-dist = [
{ name = "diskcache", specifier = ">=5.6.3" },
{ name = "typer", specifier = ">=0.20.0" },
]
[[package]]
name = "cachetools"
version = "6.2.2"