16 llama_present = any(i.find(
'<0xF0>') != -1
for i
in vocab.keys())
17 underscore_present = (len([1
for i
in vocab.keys()
if i.find(
'\u2581') != -1]) / len(vocab)) > 0.2
18 g_present = (len([1
for i
in vocab.keys()
if i.find(
'\u0120') != -1]) / len(vocab)) > 0.2
21 if underscore_present:
22 result.add(
"sentencepiece")
29 old_char_to_new_char = {}
32 if i ==
"sentencepiece":
33 old_char_to_new_char[
"\u2581".encode(
"UTF-8")] = b
" "
38 old_char_to_new_char[(
"<0x" + f
"{j:02x}".upper() +
">").encode(
"UTF-8")] = bytes([j])
40 raise ValueError(f
"{i} is not a valid processor name!")
42 regex = re.compile(b
"(%s)" % b
"|".join(sorted(list(map(re.escape, old_char_to_new_char.keys())), key=
lambda x: len(x), reverse=
True)))
47 new_vocab[new_k] = token_id