#!/usr/bin/env bash
set -e # exit on error
set -u # exit on using unset variable
set -o pipefail # exit on error in pipe
CURRENT_DIR=$(pwd -L)
#COLLECTION_NAME=$1
TARGET_WORD=bat     # word to train a per-word encoder for
DIM=4               # output dimensionality of the compressed representation
SAMPLES=4000        # number of sentences to sample for the target word
NEIGHBORS=20        # n_neighbours parameter of the UMAP reduction
LMODEL=mxbai-large  # tag of the large embedding model, used in output paths
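## Hypothetical tweak (not part of the original pipeline): take the word from
## the command line instead of hardcoding it, mirroring COLLECTION_NAME=$1 above:
#TARGET_WORD="${1:-bat}"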
# Activate the virtual environment from either common location, if present
. venv/bin/activate || . .venv/bin/activate || true
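## Pipeline overview: the commented-out steps below are one-time corpus
## preparation (split into sentences, encode, upload to Qdrant). The active
## steps then, for a single target word: sample sentences and reduce the
## dimensionality of their embeddings, download the sampled sentences,
## re-encode them with a smaller model, train a per-word encoder, and finally
## merge the per-word encoders into a single model.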
## Convert dataset into readable format
## Split data into sentences
#python -m mini_coil.data_pipeline.split_sentences \
#  --input-file "${CURRENT_DIR}/data/openwebtext-1920-sentences-${TARGET_WORD}.txt.gz" \
#  --output-file "${CURRENT_DIR}/data/openwebtext-sentences/openwebtext-1920-splitted-${TARGET_WORD}.txt.gz"
## Encode sentences with the large transformer model
#python -m mini_coil.data_pipeline.encode_targets \
#  --input-file "${CURRENT_DIR}/data/openwebtext-1920-sentences-${TARGET_WORD}.txt.gz" \
#  --output-file "${CURRENT_DIR}/data/output/openwebtext-1920-splitted-${TARGET_WORD}-encodings"
## Upload encoded sentences to Qdrant
#python -m mini_coil.data_pipeline.upload_to_qdrant \
#  --input-emb
#  --input-text
#  --collection-name ${COLLECTION_NAME}
## Sample sentences with the specified word and apply dimensionality reduction
python -m mini_coil.data_pipeline.compress_dimentions \
  --output-dir data/umap-"${SAMPLES}"-"${NEIGHBORS}"-"${DIM}"d-"${LMODEL}" \
  --sample-size "${SAMPLES}" --dim "${DIM}" --word "${TARGET_WORD}" --overwrite \
  --limit "${NEIGHBORS}" --n_neighbours "${NEIGHBORS}"
echo "Compressed dimentions"
## Download sampled sentences
python -m mini_coil.data_pipeline.load_sentences \
  --word "${TARGET_WORD}" \
  --matrix-dir data/umap-"${SAMPLES}"-"${NEIGHBORS}"-"${DIM}"d-"${LMODEL}" \
  --output-dir data/umap-"${SAMPLES}"-"${NEIGHBORS}"-"${DIM}"d-"${LMODEL}"-sentences
echo "Loaded sentences"
## Encode sentences with a smaller transformer model
python -m mini_coil.data_pipeline.encode_and_filter \
  --sentences-file data/umap-"${SAMPLES}"-"${NEIGHBORS}"-"${DIM}"d-"${LMODEL}"-sentences/sentences-"${TARGET_WORD}".jsonl \
  --output-file data/umap-"${SAMPLES}"-"${NEIGHBORS}"-"${DIM}"d-"${LMODEL}"-input/word-emb-"${TARGET_WORD}".npy \
  --word "${TARGET_WORD}" \
  --sample-size "${SAMPLES}"
echo "Encoded sentences"
## Train encoder **for each word**
python -m mini_coil.training.train_word \
  --embedding-path data/umap-"${SAMPLES}"-"${NEIGHBORS}"-"${DIM}"d-"${LMODEL}"-input/word-emb-"${TARGET_WORD}".npy \
  --target-path data/umap-"${SAMPLES}"-"${NEIGHBORS}"-"${DIM}"d-"${LMODEL}"/compressed_matrix_"${TARGET_WORD}".npy \
  --log-dir data/train_logs/log_"${TARGET_WORD}" \
  --output-path data/umap-"${SAMPLES}"-"${NEIGHBORS}"-"${DIM}"d-"${LMODEL}"-models/model-"${TARGET_WORD}".ptch \
  --epochs 500
# --gpu  # optional flag; to enable it, uncomment and add a trailing backslash to the --epochs line
echo "Combined models"
## Merge the encoders for each word into a single model
python -m mini_coil.data_pipeline.combine_models \
  --models-dir "${CURRENT_DIR}/data/umap-${SAMPLES}-${NEIGHBORS}-${DIM}d-${LMODEL}-models" \
  --vocab-path "${CURRENT_DIR}/data/30k-vocab-filtered.txt" \
  --output-path "data/model_${SAMPLES}_${DIM}d" \
  --output-dim "${DIM}"
echo "Combined models"
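## Optional check (hypothetical; the exact output layout depends on combine_models):
#ls -lh "data/model_${SAMPLES}_${DIM}d"*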