#!/bin/bash
# This script preprocesses and binarizes the data for training translation models using fairseq.
# The only difference between this script and `prepare_data_joint_finetuning.sh` is that this
# script also generates the fairseq dicts, which are then shared across all subsequent model training.
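# Usage: bash prepare_data_joint_training.sh <exp_dir> [<vocab_dir>] [<train_data_dir>] [<devtest_data_dir>]
# for example, with a hypothetical experiment directory:
#   bash prepare_data_joint_training.sh ../en-indic-exp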
echo `date`
exp_dir=$1 # path to the experiment directory
vocab_dir=${2:-"$exp_dir/vocab"} # path to the spm-based tokenizer directory
train_data_dir=${3:-"$exp_dir/train"} # path to the train data within experiment directory
devtest_data_dir=${4:-"$exp_dir/devtest/all"} # path to the devtest data within experiment directory
root=$(dirname $0)
echo "Running experiment ${exp_dir}"
train_processed_dir=$exp_dir/data
devtest_processed_dir=$exp_dir/data
out_data_dir=$exp_dir/final_bin
mkdir -p $train_processed_dir
mkdir -p $devtest_processed_dir
mkdir -p $out_data_dir
# get a list of language pairs in the `train_data_dir`
pairs=$(ls -d $train_data_dir/* | sort)
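# the expected layout is one subdirectory per language pair, with one file per side
# (FLORES-style language codes; illustrative example):
#   $train_data_dir/eng_Latn-hin_Deva/train.eng_Latn
#   $train_data_dir/eng_Latn-hin_Deva/train.hin_Deva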
# iterate over each language pair
for pair in ${pairs[@]}; do
# extract the source and target languages from the pair name
pair=$(basename $pair)
src_lang=$(echo "$pair" | cut -d "-" -f 1)
tgt_lang=$(echo "$pair" | cut -d "-" -f 2)
echo "$src_lang - $tgt_lang"
train_norm_dir=$exp_dir/norm/$src_lang-$tgt_lang
devtest_norm_dir=$exp_dir/norm/$src_lang-$tgt_lang
mkdir -p $train_norm_dir
mkdir -p $devtest_norm_dir
# check if the source language text requires transliteration
src_transliterate="true"
if [[ $src_lang == *"Arab"* ]] || [[ $src_lang == *"Olck"* ]] || \
[[ $src_lang == *"Mtei"* ]] || [[ $src_lang == *"Latn"* ]]; then
src_transliterate="false"
fi
# check if the target language text requires transliteration
tgt_transliterate="true"
if [[ $tgt_lang == *"Arab"* ]] || [[ $tgt_lang == *"Olck"* ]] || \
[[ $tgt_lang == *"Mtei"* ]] || [[ $tgt_lang == *"Latn"* ]]; then
tgt_transliterate="false"
fi
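# e.g. for an eng_Latn-pan_Guru pair, src_transliterate is set to "false" (Latin-script
# text is left as-is) while tgt_transliterate stays "true", so the Gurmukhi side is
# mapped to Devanagari by the preprocessing step below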
# --------------------------------------------------------------------------
# train preprocessing
# --------------------------------------------------------------------------
train_infname_src=$train_data_dir/${src_lang}-${tgt_lang}/train.$src_lang
train_infname_tgt=$train_data_dir/${src_lang}-${tgt_lang}/train.$tgt_lang
train_outfname_src=$train_norm_dir/train.$src_lang
train_outfname_tgt=$train_norm_dir/train.$tgt_lang
echo "Normalizing punctuations for train"
parallel --pipe --keep-order bash $root/normalize_punctuation.sh $src_lang < $train_infname_src > $train_outfname_src._norm
parallel --pipe --keep-order bash $root/normalize_punctuation.sh $tgt_lang < $train_infname_tgt > $train_outfname_tgt._norm
# add do not translate tags to handle special failure cases
echo "Applying do not translate tags for train"
python3 scripts/normalize_regex.py $train_outfname_src._norm $train_outfname_tgt._norm $train_outfname_src.norm $train_outfname_tgt.norm
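# the .norm files now contain the text with special spans (e.g. URLs or other
# patterns handled by scripts/normalize_regex.py) wrapped in do-not-translate tags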
echo "Applying normalization and script conversion for train"
# this script preprocesses the text and, for Indic languages, converts the script to Devanagari if needed
input_size=`python3 scripts/preprocess_translate.py $train_outfname_src.norm $train_outfname_src $src_lang $src_transliterate false`
input_size=`python3 scripts/preprocess_translate.py $train_outfname_tgt.norm $train_outfname_tgt $tgt_lang $tgt_transliterate true`
echo "Number of sentences in train: $input_size"
# --------------------------------------------------------------------------
# dev preprocessing
# --------------------------------------------------------------------------
dev_infname_src=$devtest_data_dir/${src_lang}-${tgt_lang}/dev.$src_lang
dev_infname_tgt=$devtest_data_dir/${src_lang}-${tgt_lang}/dev.$tgt_lang
dev_outfname_src=$devtest_norm_dir/dev.$src_lang
dev_outfname_tgt=$devtest_norm_dir/dev.$tgt_lang
echo "Normalizing punctuations for dev"
parallel --pipe --keep-order bash $root/normalize_punctuation.sh $src_lang < $dev_infname_src > $dev_outfname_src._norm
parallel --pipe --keep-order bash $root/normalize_punctuation.sh $tgt_lang < $dev_infname_tgt > $dev_outfname_tgt._norm
# add do not translate tags to handle special failure cases
echo "Applying do not translate tags for dev"
python3 scripts/normalize_regex.py $dev_outfname_src._norm $dev_outfname_tgt._norm $dev_outfname_src.norm $dev_outfname_tgt.norm
echo "Applying normalization and script conversion for dev"
# this script preprocesses the text and, for Indic languages, converts the script to Devanagari if needed
input_size=`python3 scripts/preprocess_translate.py $dev_outfname_src.norm $dev_outfname_src $src_lang $src_transliterate false`
input_size=`python3 scripts/preprocess_translate.py $dev_outfname_tgt.norm $dev_outfname_tgt $tgt_lang $tgt_transliterate true`
echo "Number of sentences in dev: $input_size"
done
# this concatenates the lang pair data and creates text files that keep track of the
# number of lines in each lang pair. this is important for joint training, as we will
# merge all the lang pairs and the individual lang pair line counts are required for
# adding the specific lang tags later.
# the output of these scripts is a text file with one line per pair:
# lang1-lang2 n1
# lang1-lang3 n2
python3 scripts/concat_joint_data.py $exp_dir/norm $exp_dir/data 'train'
python3 scripts/concat_joint_data.py $exp_dir/norm $exp_dir/data 'dev'
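# $exp_dir/data is now expected to hold the merged train/dev files along with the
# per-pair line-count files described above (exact filenames are determined by
# scripts/concat_joint_data.py)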
# tokenization of train and dev set using the spm trained models
mkdir -p $exp_dir/bpe
splits=(train dev)
for split in ${splits[@]}; do
echo "Applying sentence piece for $split"
bash $root/apply_sentence_piece.sh $exp_dir $exp_dir/data $exp_dir/bpe SRC TGT $split
done
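# apply_sentence_piece.sh is expected to segment the merged text with the trained
# spm models (the $vocab_dir set above, $exp_dir/vocab by default) and write
# $exp_dir/bpe/$split.SRC and $exp_dir/bpe/$split.TGT, where SRC/TGT denote the
# joint source/target sides across all pairs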
# this is only required for joint training
# we apply language tags to the bpe-segmented data
# if we are translating lang1 to lang2, then <lang1 line> becomes <lang1> <lang2> <lang1 line>
mkdir -p $exp_dir/final
echo "Adding language tags"
python3 scripts/add_joint_tags_translate.py $exp_dir 'train'
python3 scripts/add_joint_tags_translate.py $exp_dir 'dev'
# this is an important step if you are training with tpu and using num_batch_buckets
# the current implementation does not remove outliers before bucketing, and hence
# removing these large sentences ourselves helps with getting better buckets
# python scripts/remove_large_sentences.py $exp_dir/bpe/train.SRC $exp_dir/bpe/train.TGT $exp_dir/final/train.SRC $exp_dir/final/train.TGT
# python scripts/remove_large_sentences.py $exp_dir/bpe/dev.SRC $exp_dir/bpe/dev.TGT $exp_dir/final/dev.SRC $exp_dir/final/dev.TGT
# python scripts/remove_large_sentences.py $exp_dir/bpe/test.SRC $exp_dir/bpe/test.TGT $exp_dir/final/test.SRC $exp_dir/final/test.TGT
echo "Binarizing data"
# use cpu_count to get num_workers instead of setting it manually when running
# on different instances
num_workers=`python3 -c "import multiprocessing; print(multiprocessing.cpu_count())"`
data_dir=$exp_dir/final
out_data_dir=$exp_dir/final_bin
rm -rf $out_data_dir
fairseq-preprocess \
--source-lang SRC --target-lang TGT \
--trainpref $data_dir/train \
--validpref $data_dir/dev \
--destdir $out_data_dir \
--workers $num_workers \
--thresholdtgt 5
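# fairseq-preprocess writes the binarized train/valid splits to $out_data_dir along
# with the shared dictionaries (dict.SRC.txt and dict.TGT.txt) that are reused when
# finetuning, as noted in the header; --thresholdtgt 5 maps target tokens seen fewer
# than 5 times to <unk> when building the target dictionary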