Formatron v0.4.2
Formatron empowers everyone to control the output format of language models with minimal overhead.
Loading...
Searching...
No Matches
_utils.py
Go to the documentation of this file.
import re
import typing
from functools import lru_cache
4
5
6def _multiple_replace(replacements, regex, text):
7 # For each match, look-up corresponding value in dictionary
8 return regex.sub(lambda mo: replacements[mo.group()], text)
9
10
# Type alias: a set of token post-processor names. The three literals are the
# only names get_original_characters accepts; anything else raises ValueError.
Processors = set[typing.Literal["sentencepiece", "<0xHH>", "dot_G"]]
12
13
14def _autodetect_processors(vocab: typing.Dict[str, int]):
15 result = set()
16 llama_present = any(i.find('<0xF0>') != -1 for i in vocab.keys())
17 underscore_present = (len([1 for i in vocab.keys() if i.find('\u2581') != -1]) / len(vocab)) > 0.2
18 g_present = (len([1 for i in vocab.keys() if i.find('\u0120') != -1]) / len(vocab)) > 0.2
19 if llama_present:
20 result.add("<0xHH>")
21 if underscore_present:
22 result.add("sentencepiece")
23 elif g_present:
24 result.add("dot_G")
25 return result
26
27
def get_original_characters(vocab: typing.Dict[str, int]) -> typing.Dict[bytes, int]:
    """Convert a tokenizer vocabulary to the bytes each token stands for.

    The processors reported by ``_autodetect_processors`` decide which
    substrings of a token are rewritten:

    * ``"sentencepiece"``: U+2581 becomes a single space.
    * ``"dot_G"``: the table from ``huggingface_bytelevel_decoder`` is applied.
    * ``"<0xHH>"``: each ``<0xHH>`` escape (uppercase hex) becomes the byte
      with that value.

    Args:
        vocab: Mapping from token text to token id.

    Returns:
        Mapping from the rewritten, UTF-8 encoded token to its id. If two
        tokens rewrite to the same bytes, the one iterated later wins. An
        empty vocabulary yields an empty result without running detection.

    Raises:
        ValueError: If detection reports an unknown processor name.
    """
    if not vocab:
        return {}

    old_char_to_new_char = {}
    for processor in _autodetect_processors(vocab):
        if processor == "sentencepiece":
            old_char_to_new_char["\u2581".encode("UTF-8")] = b" "
        elif processor == "dot_G":
            old_char_to_new_char.update(huggingface_bytelevel_decoder())
        elif processor == "<0xHH>":
            for byte_value in range(256):
                escape = f"<0x{byte_value:02X}>".encode("UTF-8")
                old_char_to_new_char[escape] = bytes([byte_value])
        else:
            raise ValueError(f"{processor} is not a valid processor name!")

    if not old_char_to_new_char:
        # With no replacements the alternation below would be empty and the
        # pattern would match the empty string, which is not a key of the
        # table. Nothing needs rewriting, so just encode the tokens.
        return {token.encode("UTF-8"): token_id for token, token_id in vocab.items()}

    # Longest keys first so a multi-byte key is never shadowed by a shorter
    # key that is a prefix of it.
    alternatives = sorted(map(re.escape, old_char_to_new_char), key=len, reverse=True)
    regex = re.compile(b"(%s)" % b"|".join(alternatives))

    new_vocab = {}
    for token, token_id in vocab.items():
        new_key = _multiple_replace(old_char_to_new_char, regex, token.encode("UTF-8"))
        new_vocab[new_key] = token_id
    return new_vocab
49
50
@lru_cache()
def huggingface_bytelevel_decoder() -> typing.Dict[bytes, bytes]:
    """Build the table that maps byte-level token characters back to bytes.

    Bytes in the ranges ``!``-``~``, ``¡``-``¬`` and ``®``-``ÿ`` are
    represented by the character with the same code point. Every other byte
    value is represented by a character at code point 256 and above, assigned
    in increasing byte order (so byte 0 is U+0100 and the space byte is
    U+0120).

    Returns:
        A 256-entry dict whose keys are the UTF-8 encodings of the stand-in
        characters and whose values are the corresponding single bytes. The
        result is cached, so every call returns the same dict object; callers
        should not mutate it.
    """
    byte_values = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    # Bytes in the ranges above stand for themselves.
    code_points = list(byte_values)
    self_mapped = set(byte_values)

    shifted = 0
    for value in range(2 ** 8):
        if value not in self_mapped:
            # Remaining bytes get the next unused code point above 255.
            byte_values.append(value)
            code_points.append(2 ** 8 + shifted)
            shifted += 1

    return {
        chr(code_point).encode("UTF-8"): bytes([value])
        for code_point, value in zip(code_points, byte_values)
    }
typing.Dict[bytes, int] get_original_characters(typing.Dict[str, int] vocab)
Definition _utils.py:28
_autodetect_processors(typing.Dict[str, int] vocab)
Definition _utils.py:14
_multiple_replace(replacements, regex, text)
Definition _utils.py:6
huggingface_bytelevel_decoder()
I hate legacy code.
Definition _utils.py:55