ziadrone committed on
Commit
16068c3
·
verified ·
1 Parent(s): 30ee01d

push fresh rebuilt model + files

Browse files
Files changed (1) hide show
  1. tokenization_shivik_m1.py +44 -0
tokenization_shivik_m1.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import json
3
+ from transformers import PreTrainedTokenizer
4
+
5
class ShivikM1Tokenizer(PreTrainedTokenizer):
    """Whitespace-splitting tokenizer backed by a JSON vocabulary file.

    The vocabulary file is a JSON object mapping token strings to integer
    ids. Text is tokenized with ``str.split()``; tokens missing from the
    vocabulary map to the id of ``"<unk>"`` (or ``0`` if ``"<unk>"`` is not
    in the vocabulary). ``merges_file`` is accepted and stored but is not
    read by this class.
    """

    vocab_files_names = {
        "vocab_file": "vocab.json",
        "merges_file": "merges.txt",
    }

    def __init__(self, vocab_file=None, merges_file=None, **kwargs):
        """Load the vocabulary and initialize the base tokenizer.

        Args:
            vocab_file: Path to the JSON token-to-id mapping. Required.
            merges_file: Path to a merges file; stored on the instance only.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.

        Raises:
            ValueError: If ``vocab_file`` is ``None``.
        """
        if vocab_file is None:
            raise ValueError("vocab_file must be provided.")

        with open(vocab_file, "r", encoding="utf-8") as f:
            self.encoder = json.load(f)

        self.decoder = {v: k for k, v in self.encoder.items()}
        self.vocab_file = vocab_file
        self.merges_file = merges_file

        # The base initializer must run *after* the vocabulary is loaded:
        # recent transformers releases (4.34+) register special/added tokens
        # inside PreTrainedTokenizer.__init__ and look up the vocabulary
        # while doing so, which fails if ``self.encoder`` does not exist yet.
        super().__init__(**kwargs)

    def get_vocab(self):
        """Return a copy of the token-to-id mapping."""
        return dict(self.encoder)

    @property
    def vocab_size(self):
        """Number of entries in the loaded vocabulary."""
        return len(self.encoder)

    def _tokenize(self, text):
        """Split ``text`` on runs of whitespace."""
        return text.split()

    def _convert_token_to_id(self, token):
        """Map a token to its id, falling back to ``"<unk>"``'s id, then 0."""
        return self.encoder.get(token, self.encoder.get("<unk>", 0))

    def _convert_id_to_token(self, idx):
        """Map an id to its token, or ``"<unk>"`` for unknown ids."""
        return self.decoder.get(idx, "<unk>")

    def convert_tokens_to_string(self, tokens):
        """Join tokens with single spaces."""
        return " ".join(tokens)

    def build_inputs_with_special_tokens(self, token_ids, token_ids_1=None):
        """Return the model input ids without adding any special tokens.

        Args:
            token_ids: Ids of the first (or only) sequence.
            token_ids_1: Optional ids of a second sequence. The base class
                passes this positionally when encoding a pair, so it must be
                accepted here.

        Returns:
            ``token_ids`` unchanged for a single sequence, or the
            concatenation ``token_ids + token_ids_1`` for a pair.
        """
        if token_ids_1 is None:
            return token_ids
        return token_ids + token_ids_1