Formatron v0.4.9
Formatron empowers everyone to control the output format of language models with minimal overhead.
utils.py
import re
import typing
from functools import lru_cache

__all__ = ["get_original_characters", "update_vocab_0xHH", "update_vocab_sentencepiece", "update_vocab_dot_G"]

def _multiple_replace(replacements: typing.Dict[bytes, bytes], regex: re.Pattern[bytes], text: bytes) -> bytes:
    # For each match, look up the corresponding value in the dictionary
    return regex.sub(lambda mo: replacements[mo.group()], text)
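# Example (a minimal sketch with a hypothetical replacement table): the regex
# is an alternation of the table's keys, so every match is rewritten through
# the table in a single pass.
#
# >>> _multiple_replace({b"a": b"1", b"bb": b"2"}, re.compile(b"(bb|a)"), b"abba")
# b'121'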


def get_original_characters(vocab: typing.Dict[str, int],
                            processors: typing.Optional[list[typing.Callable]] = None) -> typing.Dict[int, bytes]:
    """
    Get a vocabulary of original characters unmangled to raw UTF-8 bytes by the provided processors.

    Args:
        vocab: The mangled vocabulary.
        processors: List of callables with signature (token_to_char: typing.Dict[bytes, bytes]) -> None.
            Callables can be used to "unmangle" encoded characters into original characters. If None, processors are auto-detected.
    """
    old_char_to_new_char = {}
    assert len(set(vocab.values())) == len(vocab), "Vocabulary contains duplicate token IDs!"
    if processors is None:
        processors = autodetect_processors(vocab)
    for update_vocab in processors:
        update_vocab(old_char_to_new_char)
    # Create a regular expression from the dictionary keys, longest keys first,
    # so that longer sequences are matched before their prefixes
    regex = re.compile(b"(%s)" % b"|".join(sorted(map(re.escape, old_char_to_new_char.keys()), key=len, reverse=True)))
    new_vocab = {}
    for k in vocab:
        token_id = vocab[k]
        new_k = _multiple_replace(old_char_to_new_char, regex, k.encode("UTF-8"))
        new_vocab[token_id] = new_k
    return new_vocab
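# Example usage (a minimal sketch; both vocabularies are hypothetical).
# "\u2581the" is how sentencepiece tokenizers mangle " the", and "<0x0A>" is a
# llama-style byte-fallback token for the newline byte:
#
# >>> get_original_characters({"\u2581the": 0, "hello": 1})
# {0: b' the', 1: b'hello'}
# >>> get_original_characters({"<0x0A>": 0}, processors=[update_vocab_0xHH])
# {0: b'\n'}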


def autodetect_processors(vocab: typing.Dict[str, int]) -> typing.List[typing.Callable]:
    """
    Autodetect vocabulary processors from the tokens themselves.
    """
    result = []
    # Llama-style byte-fallback tokens look like <0xF0>
    llama_present = any(i.find('<0xF0>') != -1 for i in vocab.keys())
    # Sentencepiece replaces spaces with ▁ (U+2581); GPT2-style bytelevel
    # preprocessors replace them with Ġ (U+0120). Require the marker in more
    # than 20% of tokens to avoid false positives.
    underscore_present = (len([1 for i in vocab.keys() if i.find('\u2581') != -1]) / len(vocab)) > 0.2
    g_present = (len([1 for i in vocab.keys() if i.find('\u0120') != -1]) / len(vocab)) > 0.2
    if llama_present:
        result.append(update_vocab_0xHH)
    if underscore_present:
        result.append(update_vocab_sentencepiece)
    elif g_present:
        result.append(update_vocab_dot_G)
    return result
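# Detection sketch (hypothetical vocabulary): the byte-fallback token "<0xF0>"
# triggers update_vocab_0xHH, and two of the three tokens carry the
# sentencepiece marker, clearing the 20% threshold:
#
# >>> [f.__name__ for f in autodetect_processors({"<0xF0>": 0, "\u2581a": 1, "\u2581b": 2})]
# ['update_vocab_0xHH', 'update_vocab_sentencepiece']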


def update_vocab_0xHH(token_to_char: typing.Dict[bytes, bytes]):
    """
    Vocabulary processor for <0xHH> tokens (used in llama tokenizers).
    """
    for j in range(256):
        token_to_char[("<0x" + f"{j:02x}".upper() + ">").encode("UTF-8")] = bytes([j])
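# The processor installs one entry per byte value, e.g.:
#
# >>> m = {}
# >>> update_vocab_0xHH(m)
# >>> m[b"<0x41>"], m[b"<0x0A>"], len(m)
# (b'A', b'\n', 256)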


def update_vocab_sentencepiece(token_to_char: typing.Dict[bytes, bytes]):
    """
    Vocabulary processor for the ▁ token (used in sentencepiece tokenizers).
    """
    token_to_char["\u2581".encode("UTF-8")] = b" "
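# A single entry suffices here: the UTF-8 encoding of ▁ maps back to a space.
#
# >>> m = {}
# >>> update_vocab_sentencepiece(m)
# >>> m
# {b'\xe2\x96\x81': b' '}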


def update_vocab_dot_G(token_to_char: typing.Dict[bytes, bytes]):
    """
    Vocabulary processor for GPT2-style token mangling, like from a space to Ġ (used in huggingface bytelevel preprocessors).
    """
    token_to_char.update(_huggingface_bytelevel_decoder())
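# The bytelevel scheme maps every raw byte to a printable character; this
# processor installs the inverse table, e.g. Ġ (U+0120) back to a space and
# Ċ (U+010A) back to a newline:
#
# >>> m = {}
# >>> update_vocab_dot_G(m)
# >>> m["\u0120".encode("UTF-8")], m["\u010a".encode("UTF-8")]
# (b' ', b'\n')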


@lru_cache()
def _huggingface_bytelevel_decoder():
    """
    I hate legacy code. Reconstructs the inverse of the GPT2 bytes-to-unicode table.
    """
    # Bytes that GPT2's bytelevel encoder keeps as-is: printable ASCII plus two
    # printable latin-1 ranges
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    # Every remaining byte is shifted to an unused code point above 0xFF
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n).encode("UTF-8") for n in cs]
    for i in range(len(bs)):
        bs[i] = bytes([bs[i]])
    # Map each mangled character (as UTF-8 bytes) back to its raw byte
    return dict(zip(cs, bs))
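

# A runnable smoke test for the module (a minimal sketch; the vocabularies are
# hypothetical and the assertions restate the examples above):
if __name__ == "__main__":
    assert get_original_characters({"\u2581the": 0, "hello": 1}) == {0: b" the", 1: b"hello"}
    assert get_original_characters({"<0x0A>": 0}, processors=[update_vocab_0xHH]) == {0: b"\n"}
    # The bytelevel decoder is a bijection over all 256 byte values
    assert len(_huggingface_bytelevel_decoder()) == 256
    print("utils.py sanity checks passed")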