From 9fbd757a188aff161922907b07b8354c47343d91 Mon Sep 17 00:00:00 2001
From: Alessandro Clerici
Date: Wed, 9 Jul 2025 15:41:46 +0000
Subject: [PATCH] added (for now separated) scripts for steps

---
 embed.sh   | 27 +++++++++++++++++++++++++++
 index.sh   | 29 +++++++++++++++++++++++++++++
 install.sh | 14 ++++++++++++++
 rag.sh     | 37 +++++++++++++++++++++++++++++++++++++
 4 files changed, 107 insertions(+)
 create mode 100755 embed.sh
 create mode 100755 index.sh
 create mode 100755 install.sh
 create mode 100755 rag.sh

diff --git a/embed.sh b/embed.sh
new file mode 100755
index 0000000..c23a974
--- /dev/null
+++ b/embed.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+# Embedding step: runs examples/run_page_embedding.py and writes to $LOCAL_EMBEDDINGS_DIR/$EMBEDDING_NAME.
+set -e  # abort on the first failing command
+. ./.env  # presumably defines the LOCAL_*_DIR variables checked below
+mkdir -p "${LOCAL_DATA_DIR:?LOCAL_DATA_DIR must be set (e.g. in .env)}"
+mkdir -p "${LOCAL_EMBEDDINGS_DIR:?LOCAL_EMBEDDINGS_DIR must be set (e.g. in .env)}"
+mkdir -p "${LOCAL_MODEL_DIR:?LOCAL_MODEL_DIR must be set (e.g. in .env)}"
+mkdir -p "${LOCAL_OUTPUT_DIR:?LOCAL_OUTPUT_DIR must be set (e.g. in .env)}"
+. .venv/bin/activate
+set -x  # trace only the commands from here on
+
+DATASET_NAME="m3-docvqa"
+RETRIEVAL_MODEL_TYPE="colpali"
+RETRIEVAL_MODEL_NAME="colpaligemma-3b-pt-448-base"
+RETRIEVAL_ADAPTER_MODEL_NAME="colpali-v1.2"
+SPLIT="dev"
+EMBEDDING_NAME="${RETRIEVAL_ADAPTER_MODEL_NAME}_${DATASET_NAME}_${SPLIT}" # where to save embeddings
+# Alternative launcher: uv tool run accelerate launch --num_processes=1 --mixed_precision=bf16 examples/run_page_embedding.py
+accelerate launch --num_processes=1 --mixed_precision=bf16 examples/run_page_embedding.py \
+    --use_retrieval \
+    --retrieval_model_type="$RETRIEVAL_MODEL_TYPE" \
+    --data_name="$DATASET_NAME" \
+    --split="$SPLIT" \
+    --loop_unique_doc_ids=True \
+    --output_dir="$LOCAL_EMBEDDINGS_DIR/$EMBEDDING_NAME" \
+    --retrieval_model_name_or_path="$RETRIEVAL_MODEL_NAME" \
+    --retrieval_adapter_model_name_or_path="$RETRIEVAL_ADAPTER_MODEL_NAME"
diff --git a/index.sh b/index.sh
new file mode 100755
index 0000000..b960b10
--- /dev/null
+++ b/index.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+# Indexing step: runs examples/run_indexing_m3docvqa.py and writes to $LOCAL_EMBEDDINGS_DIR/$INDEX_NAME.
+set -e  # abort on the first failing command
+. ./.env  # presumably defines the LOCAL_*_DIR variables checked below
+mkdir -p "${LOCAL_DATA_DIR:?LOCAL_DATA_DIR must be set (e.g. in .env)}"
+mkdir -p "${LOCAL_EMBEDDINGS_DIR:?LOCAL_EMBEDDINGS_DIR must be set (e.g. in .env)}"
+mkdir -p "${LOCAL_MODEL_DIR:?LOCAL_MODEL_DIR must be set (e.g. in .env)}"
+mkdir -p "${LOCAL_OUTPUT_DIR:?LOCAL_OUTPUT_DIR must be set (e.g. in .env)}"
+. .venv/bin/activate
+set -x  # trace only the commands from here on
+
+DATASET_NAME="m3-docvqa"
+RETRIEVAL_MODEL_TYPE="colpali"
+RETRIEVAL_ADAPTER_MODEL_NAME="colpali-v1.2"
+SPLIT="dev"
+FAISS_INDEX_TYPE='ivfflat'
+EMBEDDING_NAME="${RETRIEVAL_ADAPTER_MODEL_NAME}_${DATASET_NAME}_${SPLIT}" # same composition as in embed.sh
+INDEX_NAME="${EMBEDDING_NAME}_pageindex_${FAISS_INDEX_TYPE}" # where to save resulting index
+echo "$EMBEDDING_NAME"
+echo "$FAISS_INDEX_TYPE"
+uv run examples/run_indexing_m3docvqa.py \
+    --use_retrieval \
+    --retrieval_model_type="$RETRIEVAL_MODEL_TYPE" \
+    --data_name="$DATASET_NAME" \
+    --split="$SPLIT" \
+    --loop_unique_doc_ids=False \
+    --embedding_name="$EMBEDDING_NAME" \
+    --faiss_index_type="$FAISS_INDEX_TYPE" \
+    --output_dir="$LOCAL_EMBEDDINGS_DIR/$INDEX_NAME"
diff --git a/install.sh b/install.sh
new file mode 100755
index 0000000..0b0e829
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+# Creates the uv virtual environment, installs the pinned cu128 torch wheels, build helpers and flash-attn, then runs uv sync.
+set -e  # abort on the first failing command
+set -x  # trace every command
+
+# external prerequisites (not installed by this script):
+#   git, git-lfs
+#   the model checkpoints
+
+uv venv
+uv pip install --extra-index-url https://download.pytorch.org/whl/cu128 torch==2.7.1+cu128 torchvision==0.22.1+cu128
+uv pip install packaging setuptools wheel psutil markupsafe ninja
+uv pip install -v flash-attn --no-build-isolation
+uv sync
diff --git a/rag.sh b/rag.sh
new file mode 100755
index 0000000..3b028a2
--- /dev/null
+++ b/rag.sh
@@ -0,0 +1,37 @@
+#!/bin/sh
+# RAG step: runs examples/run_rag_m3docvqa.py and writes to $LOCAL_EVALOUTPUT_DIR/$OUTPUT_SAVE_NAME.
+set -e  # abort on the first failing command
+. ./.env  # presumably defines the LOCAL_*_DIR variables checked below
+mkdir -p "${LOCAL_DATA_DIR:?LOCAL_DATA_DIR must be set (e.g. in .env)}"
+mkdir -p "${LOCAL_EMBEDDINGS_DIR:?LOCAL_EMBEDDINGS_DIR must be set (e.g. in .env)}"
+mkdir -p "${LOCAL_MODEL_DIR:?LOCAL_MODEL_DIR must be set (e.g. in .env)}"
+mkdir -p "${LOCAL_OUTPUT_DIR:?LOCAL_OUTPUT_DIR must be set (e.g. in .env)}"
+mkdir -p "${LOCAL_EVALOUTPUT_DIR:?LOCAL_EVALOUTPUT_DIR must be set (e.g. in .env)}"
+. .venv/bin/activate
+set -x  # trace only the commands from here on
+
+BACKBONE_MODEL_NAME="Qwen2-VL-7B-Instruct"
+RETRIEVAL_MODEL_TYPE="colpali"
+RETRIEVAL_MODEL_NAME="colpaligemma-3b-pt-448-base"
+RETRIEVAL_ADAPTER_MODEL_NAME="colpali-v1.2"
+SPLIT="dev"
+DATASET_NAME="m3-docvqa"
+EMBEDDING_NAME="${RETRIEVAL_ADAPTER_MODEL_NAME}_${DATASET_NAME}_${SPLIT}" # from Step 1 Embedding; same composition as embed.sh
+FAISS_INDEX_TYPE='ivfflat'
+N_RETRIEVAL_PAGES=1
+INDEX_NAME="${EMBEDDING_NAME}_pageindex_${FAISS_INDEX_TYPE}" # from Step 2 Indexing; NOTE(review): not passed to the command below
+OUTPUT_SAVE_NAME="${RETRIEVAL_ADAPTER_MODEL_NAME}_${BACKBONE_MODEL_NAME}_${DATASET_NAME}" # where to save RAG results
+BITS=16 # BITS=4 for 4-bit quantization on low-memory GPUs
+uv run examples/run_rag_m3docvqa.py \
+    --use_retrieval \
+    --retrieval_model_type="$RETRIEVAL_MODEL_TYPE" \
+    --load_embedding=True \
+    --split="$SPLIT" \
+    --bits="$BITS" \
+    --n_retrieval_pages="$N_RETRIEVAL_PAGES" \
+    --data_name="$DATASET_NAME" \
+    --model_name_or_path="$BACKBONE_MODEL_NAME" \
+    --embedding_name="$EMBEDDING_NAME" \
+    --retrieval_model_name_or_path="$RETRIEVAL_MODEL_NAME" \
+    --retrieval_adapter_model_name_or_path="$RETRIEVAL_ADAPTER_MODEL_NAME" \
+    --output_dir="$LOCAL_EVALOUTPUT_DIR/$OUTPUT_SAVE_NAME"