From 2ab91099f2e2de0a6c44ad357efd47fcd8ff84b9 Mon Sep 17 00:00:00 2001 From: Alessandro Clerici Date: Wed, 9 Jul 2025 15:39:31 +0000 Subject: [PATCH] no longer tracking egg-info --- src/m3docrag.egg-info/PKG-INFO | 215 --------------------- src/m3docrag.egg-info/SOURCES.txt | 31 --- src/m3docrag.egg-info/dependency_links.txt | 1 - src/m3docrag.egg-info/requires.txt | 28 --- src/m3docrag.egg-info/top_level.txt | 1 - 5 files changed, 276 deletions(-) delete mode 100644 src/m3docrag.egg-info/PKG-INFO delete mode 100644 src/m3docrag.egg-info/SOURCES.txt delete mode 100644 src/m3docrag.egg-info/dependency_links.txt delete mode 100644 src/m3docrag.egg-info/requires.txt delete mode 100644 src/m3docrag.egg-info/top_level.txt diff --git a/src/m3docrag.egg-info/PKG-INFO b/src/m3docrag.egg-info/PKG-INFO deleted file mode 100644 index b005234..0000000 --- a/src/m3docrag.egg-info/PKG-INFO +++ /dev/null @@ -1,215 +0,0 @@ -Metadata-Version: 2.2 -Name: m3docrag -Version: 0.0.1 -Summary: Multimodal Document Understanding with RAG -Classifier: Programming Language :: Python :: 3 -Requires-Python: >=3.10 -Description-Content-Type: text/markdown -Requires-Dist: accelerate==1.1.0 -Requires-Dist: loguru -Requires-Dist: requests -Requires-Dist: setuptools==69.5 -Requires-Dist: transformers -Requires-Dist: tokenizers -Requires-Dist: flash-attn==2.5.8 -Requires-Dist: bitsandbytes==0.43.1 -Requires-Dist: safetensors -Requires-Dist: gpustat -Requires-Dist: icecream -Requires-Dist: pdf2image -Requires-Dist: numpy==1.26.4 -Requires-Dist: torchvision -Requires-Dist: jsonlines -Requires-Dist: editdistance -Requires-Dist: einops -Requires-Dist: fire -Requires-Dist: peft -Requires-Dist: timm -Requires-Dist: sentencepiece -Requires-Dist: colpali-engine==0.3.1 -Requires-Dist: easyocr -Requires-Dist: qwen-vl-utils -Requires-Dist: faiss-cpu -Requires-Dist: word2number -Requires-Dist: datasets>=3.0.0 -Requires-Dist: python-dotenv - -# M3DocRAG - -Code for [M3DocRAG: Multi-modal Retrieval is What You Need for Multi-page Multi-document Understanding](https://m3docrag.github.io/) - -by [Jaemin Cho](https://j-min.io/), [Debanjan Mahata](https://sites.google.com/a/ualr.edu/debanjan-mahata/), [Ozan İrsoy](https://wtimesx.com/), [Yujie He](https://scholar.google.com/citations?user=FbeAZGgAAAAJ&hl=en), [Mohit Bansal](https://www.cs.unc.edu/~mbansal/) - -# Summary - -## Comparison with previous approches - - - -Comparison of multi-modal document understanding pipelines. Previous works focus on (a) **Single-page DocVQA** that cannot handle many long documents or (b) **Text-based RAG** that ignores visual information. Our (c) **M3DocRAG** framework retrieves relevant documents and answers questions using multi-modal retrieval and MLM components, so that it can efficiently handle many long documents while preserving visual information. - -## M3DocRAG framework - - - -Our **M3DocRAG** framework consists of three stages: (1) document embedding, (2) page retrieval, and (3) question answering. -- In (1) document embedding, we extract visual embedding (with ColPali) to represent each page from all PDF documents. -- In (2) page retrieval, we retrieve the top-K pages of high relevance (MaxSim scores) with text queries. In an open-domain setting, we create approximate page indices for faster search. -- In (3) question answering, we conduct visual question answering with multi-modal LM (e.g. Qwen2-VL) to obtain the final answer. - - -# Setup - -## Package - -We assume conda has been installed - -```bash -git clone -cd m3docrag-release -pip install -e . - -# Install Poppler (for pdf2image; check https://pdf2image.readthedocs.io/en/latest/installation.html for details) -# conda install -y poppler -# or -# apt-get install poppler-utils -``` - -## Code structure - -```bash -examples/ # scripts to run PDF embedding / RAG -src/m3docrag/ - datasets/ # data loader for existing datasets - retrieval/ # retrieval model (e.g., ColPaLi) - vqa/ # vqa model (e.g., Qwen2-VL) - rag/ # RAG model that combines retrieval and vqa models - utils/ # misc utility methods -m3docvqa/ # how to setup m3docvqa dataset -``` -## Paths: Data, Embeddings, Model checkpoints, Outputs - -```bash -# in .env -LOCAL_DATA_DIR="/job/datasets" # where to store data -LOCAL_EMBEDDINGS_DIR="/job/embeddings" # where to store embeddings -LOCAL_MODEL_DIR="/job/model" # where to store model checkpoints -LOCAL_OUTPUT_DIR="/job/output" # where to store model outputs -``` - -You can adjust variables in [`.env`](.env) to change where to store data/embedding/model checkpoint/outputs by default. They are loaded in [`src/m3docrag/utils/paths.py`](./src/m3docrag/utils/paths.py) via [python-dotenv](https://github.com/theskumar/python-dotenv). - - -## Download M3DocVQA dataset - -Please see [m3docvqa/README.md](m3docvqa/README.md) for the download instruction. - -## Donwload model checkpoints - -By default, we use colpali-v1.2 for retrival and Qwen2-VL-7B-Instruct for question answering. - -At `$LOCAL_MODEL_DIR`, download [colpali-v1.2](https://huggingface.co/vidore/colpali-v1.2), [colpaligemma-3b-mix-448-base](https://huggingface.co/vidore/colpaligemma-3b-mix-448-base) and [Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) checkpoints. - -```bash -cd $LOCAL_MODEL_DIR - -git clone https://huggingface.co/vidore/colpaligemma-3b-pt-448-base # ColPali backbone -git clone https://huggingface.co/vidore/colpali-v1.2 # ColPali adapter -git clone https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct # VQA -``` - - - - - -# Example usage - -Below we describe example usage of M3DocRAG on M3DocVQA dataset. - - -## 1. Extract PDF embeddings - -```bash -DATASET_NAME="m3-docvqa" -RETRIEVAL_MODEL_TYPE="colpali" -RETRIEVAL_MODEL_NAME="colpaligemma-3b-pt-448-base" -RETRIEVAL_ADAPTER_MODEL_NAME="colpali-v1.2" -SPLIT="dev" -EMBEDDING_NAME=$RETRIEVAL_ADAPTER_MODEL_NAME"_"$DATASET_NAME"_"$SPLIT # where to save embeddings -accelerate launch --num_processes=1 --mixed_precision=bf16 examples/run_page_embedding.py \ - --use_retrieval \ - --retrieval_model_type=$RETRIEVAL_MODEL_TYPE \ - --data_name=$DATASET_NAME \ - --split=$SPLIT \ - --loop_unique_doc_ids=True \ - --output_dir=/job/embeddings/$EMBEDDING_NAME \ - --retrieval_model_name_or_path=$RETRIEVAL_MODEL_NAME \ - --retrieval_adapter_model_name_or_path=$RETRIEVAL_ADAPTER_MODEL_NAME -``` - -## 2. Indexing - -```bash -DATASET_NAME="m3-docvqa" -RETRIEVAL_MODEL_TYPE="colpali" -RETRIEVAL_ADAPTER_MODEL_NAME="colpali-v1.2" -SPLIT="dev" -FAISS_INDEX_TYPE='ivfflat' -EMBEDDING_NAME=$RETRIEVAL_ADAPTER_MODEL_NAME"_"$DATASET_NAME"_"$SPLIT -INDEX_NAME=$EMBEDDING_NAME"_pageindex_"$FAISS_INDEX_TYPE # where to save resulting index -echo $EMBEDDING_NAME -echo $FAISS_INDEX_TYPE -python examples/run_indexing_m3docvqa.py \ - --use_retrieval \ - --retrieval_model_type=$RETRIEVAL_MODEL_TYPE \ - --data_name=$DATASET_NAME \ - --split=$SPLIT \ - --loop_unique_doc_ids=False \ - --embedding_name=$EMBEDDING_NAME \ - --faiss_index_type=$FAISS_INDEX_TYPE \ - --output_dir=/job/embeddings/$INDEX_NAME -``` - -## 3. RAG - -```bash -BACKBONE_MODEL_NAME="Qwen2-VL-7B-Instruct" -RETRIEVAL_MODEL_TYPE="colpali" -RETRIEVAL_MODEL_NAME="colpaligemma-3b-pt-448-base" -RETRIEVAL_ADAPTER_MODEL_NAME="colpali-v1.2" -EMBEDDING_NAME="colpali-v1.2_m3-docvqa_dev" # from Step 1 Embedding -SPLIT="dev" -DATASET_NAME="m3-docvqa" -FAISS_INDEX_TYPE='ivfflat' -N_RETRIEVAL_PAGES=1 -INDEX_NAME="${EMBEDDING_NAME}_pageindex_$FAISS_INDEX_TYPE" # from Step 2 Indexing -OUTPUT_SAVE_NAME="${RETRIEVAL_ADAPTER_MODEL_NAME}_${BACKBONE_MODEL_NAME}_${DATASET_NAME}" # where to save RAG results -BITS=16 # BITS=4 for 4-bit qunaitzation in low memory GPUs -python examples/run_rag_m3docvqa.py \ - --use_retrieval \ - --retrieval_model_type=$RETRIEVAL_MODEL_TYPE \ - --load_embedding=True \ - --split=$SPLIT \ - --bits=$BITS \ - --n_retrieval_pages=$N_RETRIEVAL_PAGES \ - --data_name=$DATASET_NAME \ - --model_name_or_path=$BACKBONE_MODEL_NAME \ - --embedding_name=$EMBEDDING_NAME \ - --retrieval_model_name_or_path=$RETRIEVAL_MODEL_NAME \ - --retrieval_adapter_model_name_or_path=$RETRIEVAL_ADAPTER_MODEL_NAME \ - --output_dir=/job/eval_outputs/$OUTPUT_SAVE_NAME -``` - - -# Citation - -Please cite our paper if you use our dataset and/or method in your projects. - - -```bibtex -@article{Cho2024M3DocRAG, - author = {Jaemin Cho and Ozan İrsoy and Debanjan Mahata and Yujie He and Mohit Bansal}, - title = {M3DocRAG: Multi-modal Retrieval is What You Need for Multi-page Multi-document Understanding}, - year = {2024}, -} -``` diff --git a/src/m3docrag.egg-info/SOURCES.txt b/src/m3docrag.egg-info/SOURCES.txt deleted file mode 100644 index 83cd2eb..0000000 --- a/src/m3docrag.egg-info/SOURCES.txt +++ /dev/null @@ -1,31 +0,0 @@ -README.md -pyproject.toml -src/m3docrag/__init__.py -src/m3docrag.egg-info/PKG-INFO -src/m3docrag.egg-info/SOURCES.txt -src/m3docrag.egg-info/dependency_links.txt -src/m3docrag.egg-info/requires.txt -src/m3docrag.egg-info/top_level.txt -src/m3docrag/datasets/__init__.py -src/m3docrag/datasets/m3_docvqa/__init__.py -src/m3docrag/datasets/m3_docvqa/common_utils.py -src/m3docrag/datasets/m3_docvqa/dataset.py -src/m3docrag/datasets/m3_docvqa/evaluate.py -src/m3docrag/rag/__init__.py -src/m3docrag/rag/base.py -src/m3docrag/rag/multimodal.py -src/m3docrag/rag/utils.py -src/m3docrag/retrieval/__init__.py -src/m3docrag/retrieval/colpali.py -src/m3docrag/utils/args.py -src/m3docrag/utils/distributed.py -src/m3docrag/utils/paths.py -src/m3docrag/utils/pdfs.py -src/m3docrag/utils/prompts.py -src/m3docrag/utils/tar.py -src/m3docrag/vqa/__init__.py -src/m3docrag/vqa/florence2.py -src/m3docrag/vqa/idefics2.py -src/m3docrag/vqa/idefics3.py -src/m3docrag/vqa/internvl2.py -src/m3docrag/vqa/qwen2.py \ No newline at end of file diff --git a/src/m3docrag.egg-info/dependency_links.txt b/src/m3docrag.egg-info/dependency_links.txt deleted file mode 100644 index 8b13789..0000000 --- a/src/m3docrag.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/m3docrag.egg-info/requires.txt b/src/m3docrag.egg-info/requires.txt deleted file mode 100644 index 2929b2b..0000000 --- a/src/m3docrag.egg-info/requires.txt +++ /dev/null @@ -1,28 +0,0 @@ -accelerate==1.1.0 -loguru -requests -setuptools==69.5 -transformers -tokenizers -flash-attn==2.5.8 -bitsandbytes==0.43.1 -safetensors -gpustat -icecream -pdf2image -numpy==1.26.4 -torchvision -jsonlines -editdistance -einops -fire -peft -timm -sentencepiece -colpali-engine==0.3.1 -easyocr -qwen-vl-utils -faiss-cpu -word2number -datasets>=3.0.0 -python-dotenv diff --git a/src/m3docrag.egg-info/top_level.txt b/src/m3docrag.egg-info/top_level.txt deleted file mode 100644 index 33985ce..0000000 --- a/src/m3docrag.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -m3docrag