In this paper we introduce LifelongMemory, a new framework for accessing long-form egocentric videographic memory through natural language question answering and retrieval. LifelongMemory generates concise video activity descriptions of the camera wearer and leverages the zero-shot capabilities of pretrained large language models to perform reasoning over long-form video context. Furthermore, LifelongMemory uses a confidence and explanation module to produce confident, high-quality, and interpretable answers. Our approach achieves state-of-the-art performance on the EgoSchema benchmark for question answering and is highly competitive on the natural language query (NLQ) challenge of Ego4D.
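The pipeline can be summarized as: caption the egocentric video into concise activity descriptions, let a pretrained LLM reason over that textual memory zero-shot, and have it report a confidence level with an explanation. The following Python sketch illustrates this flow under stated assumptions; the names (`answer_query`, `llm`, the prompt format) are illustrative and not the authors' actual API.

```python
from dataclasses import dataclass
from typing import Callable, List


@dataclass
class Answer:
    text: str
    confidence: str   # e.g. "high" / "low", from the confidence module
    explanation: str  # rationale returned alongside the answer


def answer_query(
    captions: List[str],        # concise per-clip activity descriptions of the camera wearer
    question: str,
    llm: Callable[[str], str],  # any pretrained LLM used zero-shot (prompt -> completion)
) -> Answer:
    # 1) Condense the long video into a textual memory the LLM can reason over.
    memory = "\n".join(f"[clip {i}] {c}" for i, c in enumerate(captions))

    # 2) Ask the LLM to answer and to report confidence with a brief explanation.
    prompt = (
        "You are given activity descriptions from a long egocentric video.\n"
        f"{memory}\n\n"
        f"Question: {question}\n"
        "Answer, then state your confidence (high/low) and a brief explanation, "
        "each on its own line."
    )
    raw = llm(prompt)

    # 3) Parse the reply; the exact format is an assumption of this sketch.
    fields = (raw.strip().split("\n") + ["", "", ""])[:3]
    return Answer(text=fields[0], confidence=fields[1], explanation=fields[2])
```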
```bibtex
@misc{wang2024lifelongmemory,
      title={LifelongMemory: Leveraging LLMs for Answering Queries in Long-form Egocentric Videos},
      author={Ying Wang and Yanlai Yang and Mengye Ren},
      year={2024},
      eprint={2312.05269},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
```