# data_loader.py
# Data loading and preprocessing utilities for the Chinese-to-English
# seq2seq-with-attention model (forked from victorlin12345/Chinese2English_Seq2Seq_Attention).

from __future__ import absolute_import, division, print_function, unicode_literals

import io
import logging
import re

import jieba
import tensorflow as tf

# Silence TensorFlow logging and quiet jieba's start-up messages.
logger = tf.get_logger()
logger.setLevel(logging.FATAL)
jieba.setLogLevel(logging.INFO)

def preprocess_eng_sentence(w):
    w = w.lower().strip()
    # Replace every run of characters other than letters and ? ' . ! , ¿ with a space.
    w = re.sub(r"[^a-zA-Z?'.!,¿]+", " ", w)
    w = w.strip()
    # Add start and end tokens so the model knows when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

def preprocess_ch_sentence(w):
    w = w.strip()
    # Add start and end tokens so the model knows when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

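# Illustrative behaviour of the two preprocessors (the example strings are
# hypothetical, not taken from the corpus):
#   preprocess_eng_sentence("Hello, world!") -> '<start> hello, world! <end>'
#   preprocess_ch_sentence("你好 世界")        -> '<start> 你好 世界 <end>'
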
# Read the corpus file (one "ENGLISH<TAB>CHINESE" pair per line), clean both
# sentences, and return two aligned tuples: (English sentences, Chinese sentences).
def create_dataset(path, num_examples):
    word_pairs = []
    with io.open(path, encoding='UTF-8') as f:
        lines = f.read().strip().split('\n')
    for line in lines[:num_examples]:
        eng, ch = line.strip().split("\t")
        word_pairs.append([preprocess_eng_sentence(eng), preprocess_ch_sentence(ch)])
    return zip(*word_pairs)

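# Usage sketch. The corpus path and the sample line are placeholders inferred
# from the tab-split above, not taken from the real dataset file:
#   a line in the file:  Hi.<TAB>你好。
#   eng, ch = create_dataset("dataset/corpus.txt", 1000)
#   # eng[0] == '<start> hi. <end>',  ch[0] == '<start> 你好。 <end>'
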
# Turn a list of sentences into padded token sequences plus the fitted Tokenizer.
def tokenize(lang):
    # The word-to-token mapping is deterministic for a given corpus; indices
    # start at 1 and 0 is reserved for padding.
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    # padding='post': pad each sequence at the end.
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor, lang_tokenizer

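# Illustrative result (shapes depend on the input sentences):
#   tensor, tok = tokenize(('<start> hi. <end>', '<start> how are you? <end>'))
#   # tensor has shape (2, 5) and is right-padded with 0;
#   # tok.word_index maps each word to an id starting at 1.
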
# Build the input (Chinese) and target (English) token sequences and their Tokenizers.
def load_dataset(path, num_examples=None):
    # Create cleaned (target, input) sentence pairs.
    targ_lang, inp_lang = create_dataset(path, num_examples)
    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)
    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

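# Typical call (the path is a placeholder):
#   inp_tensor, targ_tensor, inp_tok, targ_tok = load_dataset("dataset/corpus.txt", 30000)
#   # inp_tensor holds the tokenized Chinese sentences, targ_tensor the English ones.
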
# Print the word corresponding to each non-padding token in a sequence.
def convert(lang, tensor):
    for t in tensor:
        if t != 0:
            print("%d ----> %s" % (t, lang.index_word[t]))

# Save the tokenizer's index-to-word mapping as a TSV file for testing/translation.
def save_index2word(lang, file_name):
    with open(file_name, "w", encoding="utf-8") as f:
        for index, word in lang.index_word.items():
            f.write(str(index) + "\t" + word + "\n")

# Load the TSV mapping file as a word -> index dictionary.
def load_word2index(file_name):
    word2index = {}
    with open(file_name, "r", encoding="utf-8") as f:
        for line in f:
            index, word = line.strip().split("\t")
            word2index[word] = int(index)
    return word2index

# Load the TSV mapping file as an index -> word dictionary.
def load_index2word(file_name):
    index2word = {}
    with open(file_name, "r", encoding="utf-8") as f:
        for line in f:
            index, word = line.strip().split("\t")
            index2word[int(index)] = word
    return index2word

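# Round-trip sketch (the file name and the mapped values are illustrative):
#   save_index2word(inp_tok, "inp_index2word.txt")
#   word2index = load_word2index("inp_index2word.txt")   # e.g. {'<start>': 1, ...}
#   index2word = load_index2word("inp_index2word.txt")   # e.g. {1: '<start>', ...}
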
# Save the max input/target sequence lengths and the vocabulary sizes to a TSV file.
def save_max_length(inp_tensor, targ_tensor, vocab_inp_size, vocab_targ_size, file_name):
    inp_len = max(len(t) for t in inp_tensor)
    targ_len = max(len(t) for t in targ_tensor)
    with open(file_name, "w") as f:
        f.write("INPUT\t" + str(inp_len) + "\n")
        f.write("INPUT_VOCAB\t" + str(vocab_inp_size) + "\n")
        f.write("TARGET\t" + str(targ_len) + "\n")
        f.write("TARGET_VOCAB\t" + str(vocab_targ_size) + "\n")

# Load the max lengths and vocabulary sizes written by save_max_length().
def load_max_length(file_name):
    max_len = {}
    with open(file_name, "r") as f:
        for line in f:
            k, v = line.strip().split("\t")
            max_len[k] = int(v)
    return max_len["INPUT"], max_len["INPUT_VOCAB"], max_len["TARGET"], max_len["TARGET_VOCAB"]

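# The max-length file is a small TSV; the numbers below are illustrative only:
#   INPUT<TAB>34
#   INPUT_VOCAB<TAB>9394
#   TARGET<TAB>37
#   TARGET_VOCAB<TAB>4803
# load_max_length() returns those four integers in that order.
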
# Load the project's custom jieba dictionary before segmenting Chinese input.
def init_jieba_dict():
    jieba.load_userdict("dataset/jieba_dict.txt")

# Segment a Chinese sentence into space-separated words with jieba.
def seg2words(original_sentence):
    words = jieba.cut(original_sentence, cut_all=False)
    return " ".join(words)

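# Minimal end-to-end sketch of how these helpers fit together. The file paths
# below are placeholders, not necessarily the ones used by the original
# training script.
if __name__ == "__main__":
    init_jieba_dict()
    input_tensor, target_tensor, inp_tok, targ_tok = load_dataset("dataset/corpus.txt", 1000)
    save_index2word(inp_tok, "inp_index2word.txt")
    save_index2word(targ_tok, "targ_index2word.txt")
    # Vocabulary sizes include index 0, which is reserved for padding.
    save_max_length(input_tensor, target_tensor,
                    len(inp_tok.word_index) + 1, len(targ_tok.word_index) + 1,
                    "max_length.txt")
    # Show the token-to-word mapping of the first (Chinese) training example.
    convert(inp_tok, input_tensor[0])
    print(seg2words("今天天氣很好"))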