source file: /System/Library/Frameworks/Python.framework/Versions/2.3/lib/python2.3/sre_compile.py
file stats: 360 lines, 224 executed: 62.2% covered
#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import _sre, sys

from sre_constants import *

assert _sre.MAGIC == MAGIC, "SRE module mismatch"

if _sre.CODESIZE == 2:
    MAXCODE = 65535
else:
    MAXCODE = 0xFFFFFFFFL

def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    emit = code.append
    for op, av in pattern:
        if op in (LITERAL, NOT_LITERAL):
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = lambda x: x
            skip = len(code); emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
            if flags & SRE_FLAG_TEMPLATE:
                raise error, "internal: unsupported template operator"
                emit(OPCODES[REPEAT])
                skip = len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = len(code) - skip
            elif _simple(av) and op != REPEAT:
                if op == MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = len(code) - skip
            else:
                emit(OPCODES[REPEAT])
                skip = len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = len(code) - skip
                if op == MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in (SUCCESS, FAILURE):
            emit(OPCODES[op])
        elif op in (ASSERT, ASSERT_NOT):
            emit(OPCODES[op])
            skip = len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error, "look-behind requires fixed-width pattern"
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is AT:
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            tail = []
            for av in av[1]:
                skip = len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tail.append(len(code)); emit(0)
                code[skip] = len(code) - skip
            emit(0) # end of branch
            for tail in tail:
                code[tail] = len(code) - tail
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        else:
            raise ValueError, ("unsupported operand type", op)
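# --- illustrative sketch, not part of sre_compile.py --------------------
# _compile above leans on one recurring idiom: reserve a code word as a
# placeholder ("skip"), emit the operator's body, then overwrite the
# placeholder with the distance past that body. A minimal stand-alone
# version of the idiom; the helper name is hypothetical:

def _emit_with_skip(code, body):
    emit = code.append
    skip = len(code); emit(0)      # placeholder for the skip count
    for word in body:
        emit(word)
    code[skip] = len(code) - skip  # patch: words to jump past the body

# For example:
#     code = []
#     _emit_with_skip(code, [7, 8, 9])
#     code == [4, 7, 8, 9]
# -------------------------------------------------------------------------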
def _compile_charset(charset, flags, code, fixup=None):
    # compile charset subprogram
    emit = code.append
    if fixup is None:
        fixup = lambda x: x
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            pass
        elif op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            emit(fixup(av[0]))
            emit(fixup(av[1]))
        elif op is CHARSET:
            code.extend(av)
        elif op is BIGCHARSET:
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error, "internal: unsupported set operator"
    emit(OPCODES[FAILURE])

def _optimize_charset(charset, fixup):
    # internal: optimize character set
    out = []
    charmap = [0]*256
    try:
        for op, av in charset:
            if op is NEGATE:
                out.append((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset # cannot compress
    except IndexError:
        # character set contains unicode characters
        return _optimize_unicode(charset, fixup)
    # compress character map
    i = p = n = 0
    runs = []
    for c in charmap:
        if c:
            if n == 0:
                p = i
            n = n + 1
        elif n:
            runs.append((p, n))
            n = 0
        i = i + 1
    if n:
        runs.append((p, n))
    if len(runs) <= 2:
        # use literal/range
        for p, n in runs:
            if n == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, p+n-1)))
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        return out
    return charset

def _mk_bitmap(bits):
    data = []
    if _sre.CODESIZE == 2:
        start = (1, 0)
    else:
        start = (1L, 0L)
    m, v = start
    for c in bits:
        if c:
            v = v + m
        m = m << 1
        if m > MAXCODE:
            data.append(v)
            m, v = start
    return data

# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of chunks (16 words each).

# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.

# Matching can also be done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).

# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of UTF-16 has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.
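# --- illustrative sketch, not part of sre_compile.py --------------------
# A pure-Python rendering of the two-level lookup described in the comment
# above, using plain lists instead of the packed word sequence that
# _optimize_unicode (below) actually emits. The helper name and argument
# layout are hypothetical; only the indexing scheme is taken from the
# comment.

def _bigcharset_contains(mapping, chunks, char):
    # mapping: 256 chunk numbers, indexed by the high byte of the character
    # chunks:  one 256-entry bit list per distinct chunk
    v = ord(char)
    chunk = chunks[mapping[v >> 8]]
    return chunk[v & 0xFF]

# For example, a set holding only u"\u0430" (CYRILLIC SMALL LETTER A):
#     empty   = [0]*256
#     cyr     = [0]*256; cyr[0x30] = 1
#     mapping = [0]*256; mapping[0x04] = 1
#     _bigcharset_contains(mapping, [empty, cyr], u"\u0430")   # -> 1
# -------------------------------------------------------------------------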
def _optimize_unicode(charset, fixup):
    try:
        import array
    except ImportError:
        return charset
    charmap = [0]*65536
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
        # non-BMP characters
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
            return charset
        for i in range(65536):
            charmap[i] = not charmap[i]
    comps = {}
    mapping = [0]*256
    block = 0
    data = []
    for i in range(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    if _sre.CODESIZE == 2:
        code = 'H'
    else:
        code = 'I'
    # Convert block indices to byte array of 256 bytes
    mapping = array.array('b', mapping).tostring()
    # Convert byte array to word array
    mapping = array.array(code, mapping)
    assert mapping.itemsize == _sre.CODESIZE
    header = header + mapping.tolist()
    data[0:0] = header
    return [(BIGCHARSET, data)]

def _simple(av):
    # check if av is a "simple" operator
    lo, hi = av[2].getwidth()
    if lo == 0 and hi == MAXREPEAT:
        raise error, "nothing to repeat"
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN

def _compile_info(code, pattern, flags):
    # internal: compile an info block. in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefix_skip = 0
    charset = [] # not used
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefix.append(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                op, av = av[1][0]
                if op is LITERAL:
                    prefix.append(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charset.append((op, av))
                elif op is BRANCH:
                    c = []
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            c.append((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                c = []
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        c.append((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     if prefix:
##         print "*** PREFIX", prefix, prefix_skip
##     if charset:
##         print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if len(prefix) == prefix_skip == len(pattern.data):
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table
        table = [-1] + ([0]*len(prefix))
        for i in range(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip
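# --- illustrative sketch, not part of sre_compile.py --------------------
# The "overlap table" generated by _compile_info above is, in effect, the
# classic prefix (failure) table used when scanning for a literal prefix:
# entry i holds the length of the longest proper prefix of prefix[:i+1]
# that is also a suffix of it. The same computation, pulled out as a
# stand-alone helper with a hypothetical name:

def _overlap_table(prefix):
    table = [-1] + [0]*len(prefix)
    for i in range(len(prefix)):
        table[i+1] = table[i] + 1
        while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
            table[i+1] = table[table[i+1]-1] + 1
    return table[1:]            # _compile_info also drops the first entry

# For example, _overlap_table("abcab") == [0, 0, 0, 1, 2].
# -------------------------------------------------------------------------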
try:
    unicode
except NameError:
    STRING_TYPES = (type(""),)
else:
    STRING_TYPES = (type(""), type(unicode("")))

def isstring(obj):
    for tp in STRING_TYPES:
        if isinstance(obj, tp):
            return 1
    return 0

def _code(p, flags):

    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(OPCODES[SUCCESS])

    return code

def compile(p, flags=0):
    # internal: convert pattern list to internal format

    if isstring(p):
        import sre_parse
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # print code

    # XXX: <fl> get rid of this limitation!
    assert p.pattern.groups <= 100,\
           "sorry, but this version only supports 100 named groups"

    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )
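# --- illustrative sketch, not part of sre_compile.py --------------------
# A hedged example of how this module is normally driven: sre.py hands a
# pattern to compile() above, which parses it with sre_parse and passes the
# word list to _sre.compile. To inspect just the intermediate word list,
# _code() can be called directly (using the same Python 2.3 sre_parse
# module that compile() itself imports):

import sre_parse, sre_compile

parsed = sre_parse.parse("a[bc]+d", 0)
words = sre_compile._code(parsed, 0)
print words                        # raw opcode/operand words
print len(words), "code words"
# -------------------------------------------------------------------------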