Parsing In Python: Tools And Libraries より。
- 静的なパーサジェネレータ
- 生成されたパーサは標準ライブラリ以外の依存を持たない
のが最初のチャームポイント。一つ目は性能が気になる場合はオイシイかもしらんし、二つ目が特に重要なケースも多いだろう。
やってみたら生成されたパーサが「保守に耐え、読みやすい」というほどのもんではなく、自動生成らしいな、て感じだったが…本題のへろわるどに入る前に。Canopy は node.js で作られている。持ってないなら先にインストールしとくこと。node.js を持ってれば、Canopy は node.js のパッケージマネージャ npm でインストール出来る:
1 [me@host: ~]$ npm install --global canopy
(一応。--global
を付けると Windows 7 の場合は C:/Users/<you>/AppData/Roaming/npm/
に、付けなければカレントディレクトリにインストールされる。)
で、普通の PEG を記述して普通にやるんでは「おぉ、Canopy」感薄いんで、Canopy らしさ、のもう一つのチャームポイント:
- 構文定義とアクションのマッピングを同時に書ける
でへるわろでてみる:
1 grammar Maps
2 map <- "{" string ":" value "}" %make_map
3 string <- "'" [^']* "'" %make_string
4 value <- list / number
5 list <- "[" value ("," value)* "]" %make_list
6 number <- [0-9]+ %make_number
パーセント記号部分が「アクション名」。で、このファイル「maps.peg」 を入力として、「ターゲット言語 Python」を指定して起動:
1 [me@host: ~]$ canopy maps.peg --lang python
これによりこんな「maps.py
」が作られた:
1 from collections import defaultdict
2 import re
3
4
5 class TreeNode(object):
6 def __init__(self, text, offset, elements=None):
7 self.text = text
8 self.offset = offset
9 self.elements = elements or []
10
11 def __iter__(self):
12 for el in self.elements:
13 yield el
14
15
16 class TreeNode1(TreeNode):
17 def __init__(self, text, offset, elements):
18 super(TreeNode1, self).__init__(text, offset, elements)
19 self.string = elements[1]
20 self.value = elements[3]
21
22
23 class TreeNode2(TreeNode):
24 def __init__(self, text, offset, elements):
25 super(TreeNode2, self).__init__(text, offset, elements)
26 self.value = elements[1]
27
28
29 class TreeNode3(TreeNode):
30 def __init__(self, text, offset, elements):
31 super(TreeNode3, self).__init__(text, offset, elements)
32 self.value = elements[1]
33
34
35 class ParseError(SyntaxError):
36 pass
37
38
39 FAILURE = object()
40
41
42 class Grammar(object):
43 REGEX_1 = re.compile('^[^\']')
44 REGEX_2 = re.compile('^[0-9]')
45
46 def _read_map(self):
47 address0, index0 = FAILURE, self._offset
48 cached = self._cache['map'].get(index0)
49 if cached:
50 self._offset = cached[1]
51 return cached[0]
52 index1, elements0 = self._offset, []
53 address1 = FAILURE
54 chunk0 = None
55 if self._offset < self._input_size:
56 chunk0 = self._input[self._offset:self._offset + 1]
57 if chunk0 == '{':
58 address1 = TreeNode(self._input[self._offset:self._offset + 1], self._offset)
59 self._offset = self._offset + 1
60 else:
61 address1 = FAILURE
62 if self._offset > self._failure:
63 self._failure = self._offset
64 self._expected = []
65 if self._offset == self._failure:
66 self._expected.append('"{"')
67 if address1 is not FAILURE:
68 elements0.append(address1)
69 address2 = FAILURE
70 address2 = self._read_string()
71 if address2 is not FAILURE:
72 elements0.append(address2)
73 address3 = FAILURE
74 chunk1 = None
75 if self._offset < self._input_size:
76 chunk1 = self._input[self._offset:self._offset + 1]
77 if chunk1 == ':':
78 address3 = TreeNode(self._input[self._offset:self._offset + 1], self._offset)
79 self._offset = self._offset + 1
80 else:
81 address3 = FAILURE
82 if self._offset > self._failure:
83 self._failure = self._offset
84 self._expected = []
85 if self._offset == self._failure:
86 self._expected.append('":"')
87 if address3 is not FAILURE:
88 elements0.append(address3)
89 address4 = FAILURE
90 address4 = self._read_value()
91 if address4 is not FAILURE:
92 elements0.append(address4)
93 address5 = FAILURE
94 chunk2 = None
95 if self._offset < self._input_size:
96 chunk2 = self._input[self._offset:self._offset + 1]
97 if chunk2 == '}':
98 address5 = TreeNode(self._input[self._offset:self._offset + 1], self._offset)
99 self._offset = self._offset + 1
100 else:
101 address5 = FAILURE
102 if self._offset > self._failure:
103 self._failure = self._offset
104 self._expected = []
105 if self._offset == self._failure:
106 self._expected.append('"}"')
107 if address5 is not FAILURE:
108 elements0.append(address5)
109 else:
110 elements0 = None
111 self._offset = index1
112 else:
113 elements0 = None
114 self._offset = index1
115 else:
116 elements0 = None
117 self._offset = index1
118 else:
119 elements0 = None
120 self._offset = index1
121 else:
122 elements0 = None
123 self._offset = index1
124 if elements0 is None:
125 address0 = FAILURE
126 else:
127 address0 = self._actions.make_map(self._input, index1, self._offset, elements0)
128 self._offset = self._offset
129 self._cache['map'][index0] = (address0, self._offset)
130 return address0
131
132 def _read_string(self):
133 address0, index0 = FAILURE, self._offset
134 cached = self._cache['string'].get(index0)
135 if cached:
136 self._offset = cached[1]
137 return cached[0]
138 index1, elements0 = self._offset, []
139 address1 = FAILURE
140 chunk0 = None
141 if self._offset < self._input_size:
142 chunk0 = self._input[self._offset:self._offset + 1]
143 if chunk0 == '\'':
144 address1 = TreeNode(self._input[self._offset:self._offset + 1], self._offset)
145 self._offset = self._offset + 1
146 else:
147 address1 = FAILURE
148 if self._offset > self._failure:
149 self._failure = self._offset
150 self._expected = []
151 if self._offset == self._failure:
152 self._expected.append('"\'"')
153 if address1 is not FAILURE:
154 elements0.append(address1)
155 address2 = FAILURE
156 remaining0, index2, elements1, address3 = 0, self._offset, [], True
157 while address3 is not FAILURE:
158 chunk1 = None
159 if self._offset < self._input_size:
160 chunk1 = self._input[self._offset:self._offset + 1]
161 if chunk1 is not None and Grammar.REGEX_1.search(chunk1):
162 address3 = TreeNode(self._input[self._offset:self._offset + 1], self._offset)
163 self._offset = self._offset + 1
164 else:
165 address3 = FAILURE
166 if self._offset > self._failure:
167 self._failure = self._offset
168 self._expected = []
169 if self._offset == self._failure:
170 self._expected.append('[^\']')
171 if address3 is not FAILURE:
172 elements1.append(address3)
173 remaining0 -= 1
174 if remaining0 <= 0:
175 address2 = TreeNode(self._input[index2:self._offset], index2, elements1)
176 self._offset = self._offset
177 else:
178 address2 = FAILURE
179 if address2 is not FAILURE:
180 elements0.append(address2)
181 address4 = FAILURE
182 chunk2 = None
183 if self._offset < self._input_size:
184 chunk2 = self._input[self._offset:self._offset + 1]
185 if chunk2 == '\'':
186 address4 = TreeNode(self._input[self._offset:self._offset + 1], self._offset)
187 self._offset = self._offset + 1
188 else:
189 address4 = FAILURE
190 if self._offset > self._failure:
191 self._failure = self._offset
192 self._expected = []
193 if self._offset == self._failure:
194 self._expected.append('"\'"')
195 if address4 is not FAILURE:
196 elements0.append(address4)
197 else:
198 elements0 = None
199 self._offset = index1
200 else:
201 elements0 = None
202 self._offset = index1
203 else:
204 elements0 = None
205 self._offset = index1
206 if elements0 is None:
207 address0 = FAILURE
208 else:
209 address0 = self._actions.make_string(self._input, index1, self._offset, elements0)
210 self._offset = self._offset
211 self._cache['string'][index0] = (address0, self._offset)
212 return address0
213
214 def _read_value(self):
215 address0, index0 = FAILURE, self._offset
216 cached = self._cache['value'].get(index0)
217 if cached:
218 self._offset = cached[1]
219 return cached[0]
220 index1 = self._offset
221 address0 = self._read_list()
222 if address0 is FAILURE:
223 self._offset = index1
224 address0 = self._read_number()
225 if address0 is FAILURE:
226 self._offset = index1
227 self._cache['value'][index0] = (address0, self._offset)
228 return address0
229
230 def _read_list(self):
231 address0, index0 = FAILURE, self._offset
232 cached = self._cache['list'].get(index0)
233 if cached:
234 self._offset = cached[1]
235 return cached[0]
236 index1, elements0 = self._offset, []
237 address1 = FAILURE
238 chunk0 = None
239 if self._offset < self._input_size:
240 chunk0 = self._input[self._offset:self._offset + 1]
241 if chunk0 == '[':
242 address1 = TreeNode(self._input[self._offset:self._offset + 1], self._offset)
243 self._offset = self._offset + 1
244 else:
245 address1 = FAILURE
246 if self._offset > self._failure:
247 self._failure = self._offset
248 self._expected = []
249 if self._offset == self._failure:
250 self._expected.append('"["')
251 if address1 is not FAILURE:
252 elements0.append(address1)
253 address2 = FAILURE
254 address2 = self._read_value()
255 if address2 is not FAILURE:
256 elements0.append(address2)
257 address3 = FAILURE
258 remaining0, index2, elements1, address4 = 0, self._offset, [], True
259 while address4 is not FAILURE:
260 index3, elements2 = self._offset, []
261 address5 = FAILURE
262 chunk1 = None
263 if self._offset < self._input_size:
264 chunk1 = self._input[self._offset:self._offset + 1]
265 if chunk1 == ',':
266 address5 = TreeNode(self._input[self._offset:self._offset + 1], self._offset)
267 self._offset = self._offset + 1
268 else:
269 address5 = FAILURE
270 if self._offset > self._failure:
271 self._failure = self._offset
272 self._expected = []
273 if self._offset == self._failure:
274 self._expected.append('","')
275 if address5 is not FAILURE:
276 elements2.append(address5)
277 address6 = FAILURE
278 address6 = self._read_value()
279 if address6 is not FAILURE:
280 elements2.append(address6)
281 else:
282 elements2 = None
283 self._offset = index3
284 else:
285 elements2 = None
286 self._offset = index3
287 if elements2 is None:
288 address4 = FAILURE
289 else:
290 address4 = TreeNode3(self._input[index3:self._offset], index3, elements2)
291 self._offset = self._offset
292 if address4 is not FAILURE:
293 elements1.append(address4)
294 remaining0 -= 1
295 if remaining0 <= 0:
296 address3 = TreeNode(self._input[index2:self._offset], index2, elements1)
297 self._offset = self._offset
298 else:
299 address3 = FAILURE
300 if address3 is not FAILURE:
301 elements0.append(address3)
302 address7 = FAILURE
303 chunk2 = None
304 if self._offset < self._input_size:
305 chunk2 = self._input[self._offset:self._offset + 1]
306 if chunk2 == ']':
307 address7 = TreeNode(self._input[self._offset:self._offset + 1], self._offset)
308 self._offset = self._offset + 1
309 else:
310 address7 = FAILURE
311 if self._offset > self._failure:
312 self._failure = self._offset
313 self._expected = []
314 if self._offset == self._failure:
315 self._expected.append('"]"')
316 if address7 is not FAILURE:
317 elements0.append(address7)
318 else:
319 elements0 = None
320 self._offset = index1
321 else:
322 elements0 = None
323 self._offset = index1
324 else:
325 elements0 = None
326 self._offset = index1
327 else:
328 elements0 = None
329 self._offset = index1
330 if elements0 is None:
331 address0 = FAILURE
332 else:
333 address0 = self._actions.make_list(self._input, index1, self._offset, elements0)
334 self._offset = self._offset
335 self._cache['list'][index0] = (address0, self._offset)
336 return address0
337
338 def _read_number(self):
339 address0, index0 = FAILURE, self._offset
340 cached = self._cache['number'].get(index0)
341 if cached:
342 self._offset = cached[1]
343 return cached[0]
344 remaining0, index1, elements0, address1 = 1, self._offset, [], True
345 while address1 is not FAILURE:
346 chunk0 = None
347 if self._offset < self._input_size:
348 chunk0 = self._input[self._offset:self._offset + 1]
349 if chunk0 is not None and Grammar.REGEX_2.search(chunk0):
350 address1 = TreeNode(self._input[self._offset:self._offset + 1], self._offset)
351 self._offset = self._offset + 1
352 else:
353 address1 = FAILURE
354 if self._offset > self._failure:
355 self._failure = self._offset
356 self._expected = []
357 if self._offset == self._failure:
358 self._expected.append('[0-9]')
359 if address1 is not FAILURE:
360 elements0.append(address1)
361 remaining0 -= 1
362 if remaining0 <= 0:
363 address0 = self._actions.make_number(self._input, index1, self._offset, elements0)
364 self._offset = self._offset
365 else:
366 address0 = FAILURE
367 self._cache['number'][index0] = (address0, self._offset)
368 return address0
369
370
371 class Parser(Grammar):
372 def __init__(self, input, actions, types):
373 self._input = input
374 self._input_size = len(input)
375 self._actions = actions
376 self._types = types
377 self._offset = 0
378 self._cache = defaultdict(dict)
379 self._failure = 0
380 self._expected = []
381
382 def parse(self):
383 tree = self._read_map()
384 if tree is not FAILURE and self._offset == self._input_size:
385 return tree
386 if not self._expected:
387 self._failure = self._offset
388 self._expected.append('<EOF>')
389 raise ParseError(format_error(self._input, self._failure, self._expected))
390
391
392 def format_error(input, offset, expected):
393 lines, line_no, position = input.split('\n'), 0, 0
394 while position <= offset:
395 position += len(lines[line_no]) + 1
396 line_no += 1
397 message, line = 'Line ' + str(line_no) + ': expected ' + ', '.join(expected) + '\n', lines[line_no - 1]
398 message += line + '\n'
399 position -= len(line) + 1
400 message += ' ' * (offset - position)
401 return message + '^'
402
403 def parse(input, actions=None, types=None):
404 parser = Parser(input, actions, types)
405 return parser.parse()
ご覧の通り「読みたくない」スクリプトの典型だが、なんつーかコメントくらい入らんのかな。
あとはこれをモジュールとして使えばよろしい:
1 # -*- coding: utf-8 -*-
2 import maps # generated my parser
3
4 # action mapping (associated to %xxx in maps.peg)
5 class Actions(object):
6 def make_map(self, input, start, end, elements):
7 """
8 map <- "{" string ":" value "}"
9 """
10 return {elements[1]: elements[3]}
11
12 def make_string(self, input, start, end, elements):
13 """
14 string <- "'" [^']* "'"
15 """
16 return elements[1]
17
18 def make_list(self, input, start, end, elements):
19 """
20 list <- "[" value ("," value)* "]"
21 """
22 return elements[1::2]
23
24 def make_number(self, input, start, end, elements):
25 """
26 number <- [0-9]+
27 """
28 return elements[0]
29
30
31 # parse out input
32 result = maps.parse(
33 """{'x':[3,9,2]}""",
34 actions=Actions())
35 print(result)
アクションの実装は期待のものとは違うが、まぁ「こんな具合」て雰囲気ね。あと例にした構文、ちょっとヘンかもね。
ちゃんと PEG だし、アクション記述もやりやすいよね、てのが「是」の部分だが、無論「非」の部分もあって、「静的ジェネレータ」には共通だが、当然「開発初期とテスト工程のテンポは落ちる」のは当然。これは C/C++ のようなコンパイル型言語での開発スタイルと似たものだから。けど「それがいい」ってケースはあると思う。
Canopy、悪くないと思う。まぁ node.js の導入を許容出来ないなら別だけど。