Formatron v0.4.2
Formatron empowers everyone to control the output format of language models with minimal overhead.
Loading...
Searching...
No Matches
extractor.py
Go to the documentation of this file.
1"""
2Extractors for extracting data from generated strings.
3"""
4import abc
5import typing
6
7from general_sam import GeneralSam
8__all__ = ["Extractor", "ChoiceExtractor", "NonterminalExtractor"]
9
class Extractor(abc.ABC):
    """
    Base class for extractors: objects that pull data out of a string and
    describe themselves as KBNF grammar rules.

    Concrete subclasses must implement `extract`, `kbnf_reference` and `kbnf_definition`.
    """

    def __init__(self, capture_name: typing.Optional[str] = None):
        """
        Create an extractor.

        Args:
            capture_name: The name under which extracted data is captured,
                or `None` for an extractor that does not capture.
        """
        self._capture_name = capture_name

    @property
    def capture_name(self) -> typing.Optional[str]:
        """
        The capture name passed at construction, or `None` if this extractor does not capture.
        """
        return self._capture_name

    @abc.abstractmethod
    def extract(self, input_str: str) -> typing.Optional[tuple[str, typing.Any]]:
        """
        Try to extract data from the input string.

        Args:
            input_str: The string to extract from.
        Returns:
            A `(remaining_string, extracted_data)` pair on success, or `None` if the extraction failed.
        """

    @property
    @abc.abstractmethod
    def kbnf_reference(self) -> str:
        """
        How this extractor is referred to from other rules in the grammar generated by a Formatter.
        See `kbnf_definition` for how the two properties differ.
        """

    def __str__(self):
        # Renders as "${<kbnf_reference>}"; format() mirrors what an f-string field does.
        return "${" + format(self.kbnf_reference) + "}"

    @property
    @abc.abstractmethod
    def kbnf_definition(self) -> str:
        """
        The KBNF rule(s) that define this extractor in the grammar generated by a Formatter.

        `kbnf_reference` is the text other rules use to refer to the extractor, whereas
        `kbnf_definition` is the definition of the extractor itself. When the reference is
        self-sufficient and needs no extra definition, return an empty string.
        """
61
62
class NonterminalExtractor(Extractor):
    """
    An extractor whose KBNF reference is the name of a nonterminal.
    """

    def __init__(self, nonterminal: str, capture_name: typing.Optional[str] = None):
        """
        Create the nonterminal extractor.

        Args:
            nonterminal: The base name of the nonterminal.
            capture_name: The name of the capture, or `None` if the extractor does not capture.
                When given, the effective nonterminal becomes `<nonterminal>_<capture_name>`.
        """
        super().__init__(capture_name)
        # Only a capturing extractor gets the capture name appended to its nonterminal.
        self._nonterminal = (
            nonterminal if capture_name is None else f"{nonterminal}_{capture_name}"
        )

    @property
    def nonterminal(self) -> str:
        """
        The effective nonterminal name (including the capture-name suffix, if any).
        """
        return self._nonterminal

    @property
    def kbnf_reference(self) -> str:
        """
        The nonterminal name, which is how other rules reference this extractor.
        """
        return self._nonterminal
88
89
91 """
92 An extractor that extracts a literal string.
93 """
94
def __init__(self, literal: str):
    """
    Create the literal extractor.

    Args:
        literal: The exact string this extractor looks for.
    """
    # A literal extractor never captures: capturing a fixed literal is redundant.
    no_capture = None
    super().__init__(no_capture)
    self._literal = literal
104
def extract(self, input_str: str) -> typing.Optional[tuple[str, str]]:
    """
    Locate the first occurrence of the literal in the input string.

    Args:
        input_str: The string to search.
    Returns:
        `(remaining, literal)` where `remaining` is everything after the first occurrence
        of the literal (any text before the occurrence is dropped), or `None` if the
        literal does not occur in the input string.
    """
    literal = self._literal
    try:
        start = input_str.index(literal)
    except ValueError:
        # The literal does not occur anywhere in the input.
        return None
    end = start + len(literal)
    return input_str[end:], literal
113
@property
def kbnf_reference(self) -> str:
    """
    The literal rendered as a quoted string (its Python `repr`), used directly in other rules.
    """
    return f"{self._literal!r}"
117
@property
def kbnf_definition(self) -> str:
    """
    Always the empty string: the quoted literal returned by `kbnf_reference` is used as-is,
    so no separate rule is emitted.
    """
    no_rule = ""
    return no_rule
124 """
125 An extractor that uses multiple extractors to extract data. It stops at the first succeeding extractor.
126 """
127
def __init__(self, choices: typing.Iterable[Extractor], capture_name: typing.Optional[str], nonterminal: str):
    """
    Create the choice extractor.

    Args:
        choices: The extractors to choose from. The order determines the extractors' priority.
            Any iterable is accepted; it is consumed once, here.
        capture_name: The name of the capture, or `None` if the extractor does not capture.
        nonterminal: The nonterminal representing the extractor.
    """
    super().__init__(nonterminal, capture_name)
    # Snapshot the choices: they are iterated again on every `extract` call and by
    # `kbnf_definition`, so keeping a one-shot iterable (e.g. a generator) would leave
    # the extractor silently empty after its first use.
    self._choices = list(choices)
139
def extract(self, input_str: str) -> typing.Optional[tuple[str, typing.Any]]:
    """
    Run the choices in priority order and return the first successful extraction.

    Args:
        input_str: The input string.
    Returns:
        The result of the first choice whose `extract` returns a truthy value
        (the remaining string and the extracted data), or `None` if every choice failed.
    """
    # Lazy generator: later choices are not invoked once an earlier one has succeeded.
    attempts = (candidate.extract(input_str) for candidate in self._choices)
    return next((outcome for outcome in attempts if outcome), None)
154
@property
def kbnf_definition(self) -> str:
    """
    A single KBNF rule defining this extractor's nonterminal as an alternation
    of the choices' KBNF references, in priority order.
    """
    alternatives = " | ".join(choice.kbnf_reference for choice in self._choices)
    return f"{self.nonterminal} ::= {alternatives};"
159
161 """
162 An extractor that extracts a substring of a given string from the input string.
163 """
164
def __init__(self, string: str, capture_name: str, nonterminal: str, *, extract_empty_substring: bool = False):
    """
    Create the substring extractor.

    Args:
        string: The string whose substrings are extracted.
        capture_name: The name of the capture, or `None` if the extractor does not capture.
        nonterminal: The nonterminal representing the extractor.
        extract_empty_substring: Whether the empty string counts as a valid substring.
    """
    super().__init__(nonterminal, capture_name)
    self._string = string
    self.extract_empty_substring = extract_empty_substring
    # The automaton is built over the UTF-8 bytes of the string.
    encoded = string.encode("UTF-8")
    self._suffix_automaton = GeneralSam.from_bytes(encoded)
177
def extract(self, input_str: str) -> typing.Optional[tuple[str, str]]:
    """
    Extract the longest prefix of the input string that the suffix automaton still accepts.

    Characters are fed to the automaton one at a time (as UTF-8 bytes), starting from its
    root state; matching stops at the first character that drives the state to nil.

    Args:
        input_str: The input string.
    Returns:
        `(remaining, extracted)` where `extracted` is the matched prefix and `remaining`
        is the rest of the input. If nothing matched, the result is `(input_str, "")`
        when `extract_empty_substring` is True and `None` otherwise.
    """
    state = self._suffix_automaton.get_root_state()
    for index, character in enumerate(input_str):
        state.feed_bytes(character.encode("utf-8"))
        if state.is_nil():
            # `character` is the first one that broke the match.
            matched_length = index
            break
    else:
        # Never hit nil: the whole input matched (also covers empty input).
        matched_length = len(input_str)

    if matched_length == 0 and not self.extract_empty_substring:
        return None
    return input_str[matched_length:], input_str[:matched_length]
198
@property
def kbnf_definition(self) -> str:
    """
    A single KBNF rule defining this extractor's nonterminal as `#substrs` applied to
    the quoted source string (its Python `repr`).
    """
    quoted = repr(self._string)
    return f"{self.nonterminal} ::= #substrs{quoted};"
An extractor that uses multiple extractors to extract data.
Definition extractor.py:194
An abstract extractor that extracts data from a string and offers its KBNF rules definition.
Definition extractor.py:14
__init__(self, typing.Optional[str] capture_name=None)
Initialize an extractor.
Definition extractor.py:21
typing.Optional[tuple[str, typing.Any]] extract(self, str input_str)
Extract data from the input string, or None if the extraction failed.
Definition extractor.py:48
typing.Optional[str] capture_name(self)
Get the name of the capture, or None if the extractor does not capture.
Definition extractor.py:35
An extractor that extracts a literal string.
Definition extractor.py:143
__init__(self, str literal)
Initialize the literal extractor.
Definition extractor.py:151
typing.Optional[tuple[str, str]] extract(self, str input_str)
Extract the literal from the input string, or None if the literal is not found.
Definition extractor.py:158
An extractor that extracts data corresponding to a nonterminal.
Definition extractor.py:98
__init__(self, str nonterminal, typing.Optional[str] capture_name=None)
Initialize the nonterminal extractor.
Definition extractor.py:103
An extractor that extracts a substring of a given string from the input string.
Definition extractor.py:240