15 def __init__(self, capture_name: typing.Optional[str] =
None):
17 Initialize an extractor.
19 capture_name: The name of the capture, or `None` if the extractor does not capture.
26 Get the name of the capture, or `None` if the extractor does not capture.
def extract(self, input_str: str) -> typing.Optional[tuple[str, typing.Any]]:
    """
    Extract data from the input string, or `None` if the extraction failed.

    Args:
        input_str: The input string.

    Returns:
        The remaining string and the extracted data, or `None` if the extraction failed.
    """
    # NOTE(review): no statements are visible for this method in the reviewed text, so it is
    # kept docstring-only (implicitly returning `None`); presumably subclasses override it — confirm.
44 Get the KBNF reference of the extractor in the generated grammar of a Formatter.
45 Check Formatter.kbnf_definition for the difference between kbnf_reference and kbnf_definition.
49 return f
"${{{self.kbnf_reference}}}"
55 Get the KBNF definition of the extractor in the generated grammar of a Formatter.
57 The difference between kbnf_reference and kbnf_definition is that kbnf_reference is how the extractor is referenced in other rules,
58 while kbnf_definition is the definition of the extractor itself, similar to a C header file. If kbnf_reference does not need extra definition,
59 you can just return an empty string.
class NonterminalExtractor(Extractor):
    """
    An extractor that extracts data corresponding to a nonterminal.
    """

    def __init__(self, nonterminal: str, capture_name: typing.Optional[str] = None):
        """
        Initialize the nonterminal extractor.

        Args:
            nonterminal: The base name of the nonterminal.
            capture_name: The name of the capture, or `None` if the extractor does not capture.
                When given, it is appended to the nonterminal as `{nonterminal}_{capture_name}`.
        """
        super().__init__(capture_name)
        if capture_name is None:
            self._nonterminal = nonterminal
        else:
            # Only suffix the name when there is a capture; otherwise the rule
            # would be named "<nonterminal>_None".
            self._nonterminal = f"{nonterminal}_{capture_name}"

    @property
    def nonterminal(self) -> str:
        """
        Get the nonterminal of the extractor.
        """
        return self._nonterminal

    @property
    def kbnf_reference(self) -> str:
        """
        Get the KBNF reference of the extractor, which is its nonterminal.
        """
        # NOTE(review): the header of this accessor is not visible in the reviewed text; the
        # name is assumed from the `kbnf_reference` attribute that other rules read — confirm.
        return self._nonterminal
92 An extractor that extracts a literal string.
100 literal: The literal string to extract.
103 self._literal = literal
def extract(self, input_str: str) -> typing.Optional[tuple[str, str]]:
    """
    Extract the literal from the input string, or `None` if the literal is not found.

    The first occurrence is located with `str.find`, so any text that precedes the
    literal is discarded together with the literal itself.

    Args:
        input_str: The input string.

    Returns:
        The remaining string after the literal and the literal itself,
        or `None` if the literal does not occur in the input string.
    """
    pos = input_str.find(self._literal)
    if pos == -1:
        # `find` signals a miss with -1; without this guard the slice below would
        # start from a bogus index and report a match that never happened.
        return None
    return input_str[pos + len(self._literal):], self._literal
116 return repr(self._literal)
125 An extractor that uses multiple extractors to extract data. It stops at the first succeeding extractor.
def __init__(self, choices: typing.Iterable[Extractor], capture_name: typing.Optional[str], nonterminal: str):
    """
    Initialize the choice extractor.

    Args:
        choices: The extractors to choose from. The order determines the extractors' priority.
        capture_name: The name of the capture, or `None` if the extractor does not capture.
        nonterminal: The nonterminal representing the extractor.
    """
    super().__init__(nonterminal, capture_name)
    # Materialize once: the choices are iterated again on every extraction and when the
    # grammar rule is rendered, which would silently exhaust a one-shot iterator.
    self._choices = list(choices)
145 input_str: The input string.
147 The remaining string and the extracted data, or `None` if all extractors failed.
149 for choice
in self._choices:
150 matched = choice.extract(input_str)
157 return f
"{self.nonterminal} ::= {' | '.join([i.kbnf_reference for i in self._choices])};"
162 An extractor that extracts a substring of a given string from the input string.
def __init__(self, string: str, capture_name: typing.Optional[str], nonterminal: str, *,
             extract_empty_substring: bool = False):
    """
    Initialize the substring extractor.

    Args:
        string: The string to extract.
        capture_name: The name of the capture, or `None` if the extractor does not capture.
        nonterminal: The nonterminal representing the extractor.
        extract_empty_substring: Whether to extract empty substring as a valid substring.
    """
    super().__init__(nonterminal, capture_name)
    # The automaton is built over the UTF-8 bytes of the string, matching the
    # byte-wise feeding done during extraction.
    self._suffix_automaton = GeneralSam.from_bytes(string.encode("UTF-8"))
    self._string = string
    self.extract_empty_substring = extract_empty_substring
179 def extract(self, input_str: str) -> typing.Optional[tuple[str, str]]:
181 Extract the longest substring of a given string from the input string.
182 If extract_empty_substring is True, empty string is always a valid substring, so the returned string could be empty and `None` will never be returned.
183 Otherwise, empty string is not a valid substring,
184 so the returned string could not be empty and `None` will be returned if the input string does not contain the given string.
186 current_state = self._suffix_automaton.get_root_state()
188 for char
in input_str:
189 current_state.feed_bytes(char.encode(
'utf-8'))
190 if current_state.is_nil():