Added (for now separate) scripts for the steps

This commit is contained in:
Alessandro Clerici
2025-07-09 15:41:46 +00:00
parent ab46556a0a
commit 9fbd757a18
4 changed files with 107 additions and 0 deletions

27
embed.sh Executable file
View File

@ -0,0 +1,27 @@
#!/bin/sh
# embed.sh - run examples/run_page_embedding.py through `accelerate launch`
# for the "m3-docvqa" dataset ("dev" split) with the ColPali retrieval model,
# writing the embeddings under $LOCAL_EMBEDDINGS_DIR.
#
# Usage: ./embed.sh
#   Run from the directory that contains ./.env, .venv/ and examples/,
#   since all three are referenced by relative path.
#
# Required variables (presumably defined in ./.env, which is sourced below):
#   LOCAL_DATA_DIR, LOCAL_EMBEDDINGS_DIR, LOCAL_MODEL_DIR, LOCAL_OUTPUT_DIR
#
# NOTE: `set -u` is deliberately not enabled, because the sourced files
# (./.env and the virtualenv activate script) may reference unset variables.
set -e

. ./.env

# Fail early with a clear message instead of an opaque `mkdir` usage error
# when one of the directory variables is unset or empty.
: "${LOCAL_DATA_DIR:?LOCAL_DATA_DIR must be set (see ./.env)}"
: "${LOCAL_EMBEDDINGS_DIR:?LOCAL_EMBEDDINGS_DIR must be set (see ./.env)}"
: "${LOCAL_MODEL_DIR:?LOCAL_MODEL_DIR must be set (see ./.env)}"
: "${LOCAL_OUTPUT_DIR:?LOCAL_OUTPUT_DIR must be set (see ./.env)}"

# Only LOCAL_EMBEDDINGS_DIR is passed to the command below; the other
# directories are simply made sure to exist. Quoted so that paths containing
# whitespace or glob characters are handled as single arguments.
mkdir -p -- "$LOCAL_DATA_DIR"
mkdir -p -- "$LOCAL_EMBEDDINGS_DIR"
mkdir -p -- "$LOCAL_MODEL_DIR"
mkdir -p -- "$LOCAL_OUTPUT_DIR"

. .venv/bin/activate

# Trace the remaining commands (after sourcing, to keep the trace short).
set -x

DATASET_NAME="m3-docvqa"
RETRIEVAL_MODEL_TYPE="colpali"
RETRIEVAL_MODEL_NAME="colpaligemma-3b-pt-448-base"
RETRIEVAL_ADAPTER_MODEL_NAME="colpali-v1.2"
SPLIT="dev"

# Name of the sub-directory of $LOCAL_EMBEDDINGS_DIR where embeddings are saved.
EMBEDDING_NAME="${RETRIEVAL_ADAPTER_MODEL_NAME}_${DATASET_NAME}_${SPLIT}"

# Alternative launcher, kept for reference:
#   uv tool run accelerate launch --num_processes=1 --mixed_precision=bf16 examples/run_page_embedding.py ...
accelerate launch --num_processes=1 --mixed_precision=bf16 examples/run_page_embedding.py \
  --use_retrieval \
  --retrieval_model_type="$RETRIEVAL_MODEL_TYPE" \
  --data_name="$DATASET_NAME" \
  --split="$SPLIT" \
  --loop_unique_doc_ids=True \
  --output_dir="$LOCAL_EMBEDDINGS_DIR/$EMBEDDING_NAME" \
  --retrieval_model_name_or_path="$RETRIEVAL_MODEL_NAME" \
  --retrieval_adapter_model_name_or_path="$RETRIEVAL_ADAPTER_MODEL_NAME"