diff --git a/scripts/internvl2_tokenizer.py b/scripts/internvl2_tokenizer.py
index 171d6ad..090fa91 100644
--- a/scripts/internvl2_tokenizer.py
+++ b/scripts/internvl2_tokenizer.py
@@ -19,7 +19,7 @@ def encode(self, content):
return input_ids
def encode_vpm(self, content="Please describe the image shortly."):
- prompt = f"<|im_start|>system\n你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。<|im_end|><|im_start|>user\n
\n{content}<|im_end|><|im_start|>assistant\n"
+ prompt = f"<|im_start|>system\n你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。<|im_end|><|im_start|>user\n
" + "" * 64 + f"\n{content}<|im_end|><|im_start|>assistant\n"
input_ids = self.tokenizer.encode(prompt)
return input_ids
@@ -49,10 +49,10 @@ def eos_token(self):
print(tokenizer.bos_id, tokenizer.bos_token, tokenizer.eos_id,
tokenizer.eos_token)
token_ids = tokenizer.encode_vpm()
-# [151644, 8948, 198, 56568, 104625, 100633, 104455, 104800, 101101, 32022, 102022, 99602, 100013, 9370, 90286, 21287, 42140, 53772, 35243, 26288, 104949, 3837, 105205, 109641, 67916, 30698, 11, 54851, 46944, 115404, 42192, 99441, 100623, 48692, 100168, 110498, 1773, 151645, 151644, 872, 198,
-# 151646,
-# 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648,
-# 151647,
+# [151644, 8948, 198, 56568, 104625, 100633, 104455, 104800, 101101, 32022, 102022, 99602, 100013, 9370, 90286, 21287, 42140, 53772, 35243, 26288, 104949, 3837, 105205, 109641, 67916, 30698, 11, 54851, 46944, 115404, 42192, 99441, 100623, 48692, 100168, 110498, 1773, 151645, 151644, 872, 198,
+# 151646,
+# 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648,
+# 151647,
# 198, 5501, 7512, 279, 2168, 19620, 13, 151645, 151644, 77091, 198]
# 118
print(token_ids)
diff --git a/scripts/internvl2_tokenizer_448.py b/scripts/internvl2_tokenizer_448.py
new file mode 100644
index 0000000..08984dd
--- /dev/null
+++ b/scripts/internvl2_tokenizer_448.py
@@ -0,0 +1,153 @@
+from transformers import AutoTokenizer, PreTrainedTokenizerFast
+from http.server import HTTPServer, BaseHTTPRequestHandler
+import json
+import argparse
+
+
+class Tokenizer_Http():  # Wrapper exposing encode/decode and BOS/EOS accessors for the tokenizer loaded from 'internvl2_tokenizer'.
+
+ def __init__(self):
+
+ path = 'internvl2_tokenizer'  # tokenizer location handed to AutoTokenizer.from_pretrained
+ self.tokenizer = AutoTokenizer.from_pretrained(path,
+ trust_remote_code=True,
+ use_fast=False)
+
+ def encode(self, content):  # Text-only prompt: wrap `content` as the user turn of a system/user/assistant template and return its token ids.
+ prompt = f"<|im_start|>system\n你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。<|im_end|><|im_start|>user\n{content}<|im_end|><|im_start|>assistant\n"
+ input_ids = self.tokenizer.encode(prompt)
+ return input_ids
+
+ def encode_vpm(self, content="Please describe the image shortly."):  # Image-prompt variant: the user turn is a prefix, a placeholder literal repeated 256 times, then `content`. NOTE(review): the repeated literal appears empty here - presumably an image-context tag lost in extraction; confirm.
+ prompt = f"<|im_start|>system\n你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。<|im_end|><|im_start|>user\n
" + "" * 256 + f"\n{content}<|im_end|><|im_start|>assistant\n"
+ input_ids = self.tokenizer.encode(prompt)
+ return input_ids
+
+ def decode(self, token_ids):  # Convert token ids back to text with tokenization-space cleanup disabled.
+ return self.tokenizer.decode(token_ids,
+ clean_up_tokenization_spaces=False)
+
+ @property
+ def bos_id(self):  # BOS token id as reported by the wrapped tokenizer (callers below treat None as "not set").
+ return self.tokenizer.bos_token_id
+
+ @property
+ def eos_id(self):  # EOS token id as reported by the wrapped tokenizer (callers below treat None as "not set").
+ return self.tokenizer.eos_token_id
+
+ @property
+ def bos_token(self):  # BOS token string of the wrapped tokenizer.
+ return self.tokenizer.bos_token
+
+ @property
+ def eos_token(self):  # EOS token string of the wrapped tokenizer.
+ return self.tokenizer.eos_token
+
+
+tokenizer = Tokenizer_Http()
+
+print(tokenizer.bos_id, tokenizer.bos_token, tokenizer.eos_id,
+ tokenizer.eos_token)
+token_ids = tokenizer.encode_vpm()
+# [151644, 8948, 198, 56568, 104625, 100633, 104455, 104800, 101101, 32022, 102022, 99602, 100013, 9370, 90286, 21287, 42140, 53772, 35243, 26288, 104949, 3837, 105205, 109641, 67916, 30698, 11, 54851, 46944, 115404, 42192, 99441, 100623, 48692, 100168, 110498, 1773, 151645, 151644, 872, 198,
+# 151646,
+# 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648,
+# 151647,
+# 198, 5501, 7512, 279, 2168, 19620, 13, 151645, 151644, 77091, 198]
+# 118
+print(token_ids)
+print(len(token_ids))
+token_ids = tokenizer.encode("hello world")
+# [151644, 8948, 198, 56568, 104625, 100633, 104455, 104800, 101101, 32022, 102022, 99602, 100013, 9370, 90286, 21287, 42140, 53772, 35243, 26288, 104949, 3837, 105205, 109641, 67916, 30698, 11, 54851, 46944, 115404, 42192, 99441, 100623, 48692, 100168, 110498, 1773, 151645, 151644, 872, 198, 14990, 1879, 151645, 151644, 77091, 198]
+# 47
+print(token_ids)
+print(len(token_ids))
+
+
+class Request(BaseHTTPRequestHandler):
+ # Handler class defined by subclassing BaseHTTPRequestHandler; serves tokenizer queries via the module-level `tokenizer`
+ timeout = 5
+ server_version = 'Apache'
+
+ def do_GET(self):
+ print(self.path)
+ # GET handler: runs when a client sends a GET request to this server ('/bos_id' and '/eos_id' are recognised)
+ self.send_response(200)
+ self.send_header("type", "get") # set a response header; optional, and more than one may be set
+ self.end_headers()
+
+ if self.path == '/bos_id':
+ bos_id = tokenizer.bos_id
+ # print(bos_id)
+ # to json
+ if bos_id is None:
+ msg = json.dumps({'bos_id': -1})
+ else:
+ msg = json.dumps({'bos_id': bos_id})
+ elif self.path == '/eos_id':
+ eos_id = tokenizer.eos_id
+ if eos_id is None:
+ msg = json.dumps({'eos_id': -1})
+ else:
+ msg = json.dumps({'eos_id': eos_id})
+ else:
+ msg = 'error'  # unknown path: body is 'error', but status 200 was already sent above
+
+ print(msg)
+ msg = str(msg).encode() # convert to str, then encode to bytes
+
+ self.wfile.write(msg) # write the bytes back to the client
+
+ def do_POST(self):
+ # POST handler: runs when a client sends a POST request to this server ('/encode' and '/decode' are recognised)
+ data = self.rfile.read(int(
+ self.headers['content-length'])) # read the request body sent by the client (bytes)
+ data = data.decode() # decode the bytes to str
+
+ self.send_response(200)
+ self.send_header("type", "post") # set a response header; optional, and more than one may be set
+ self.end_headers()
+
+ if self.path == '/encode':
+ req = json.loads(data)
+ print(req)
+ prompt = req['text']
+ b_img_prompt = False
+ if 'img_prompt' in req:
+ b_img_prompt = req['img_prompt']
+ if b_img_prompt:
+ token_ids = tokenizer.encode_vpm(prompt)
+ else:
+ token_ids = tokenizer.encode(prompt)
+ if token_ids is None:
+ msg = json.dumps({'token_ids': -1})
+ else:
+ msg = json.dumps({'token_ids': token_ids})
+
+ elif self.path == '/decode':
+ req = json.loads(data)
+ token_ids = req['token_ids']
+ text = tokenizer.decode(token_ids)
+ if text is None:
+ msg = json.dumps({'text': ""})
+ else:
+ msg = json.dumps({'text': text})
+ else:
+ msg = 'error'  # unknown path: body is 'error', but status 200 was already sent above
+ print(msg)
+ msg = str(msg).encode() # convert to str, then encode to bytes
+
+ self.wfile.write(msg) # write the bytes back to the client
+
+
+if __name__ == "__main__":
+
+ args = argparse.ArgumentParser()
+ args.add_argument('--host', type=str, default='localhost')
+ args.add_argument('--port', type=int, default=8080)
+ args = args.parse_args()
+
+ host = (args.host, args.port) # (host, port) address tuple; 'localhost' is equivalent to '127.0.0.1'
+ print('http://%s:%s' % host)
+ server = HTTPServer(host, Request) # create the server instance from the address and the handler class defined above
+ server.serve_forever() # start serving requests