source file: /System/Library/Frameworks/Python.framework/Versions/2.3/lib/python2.3/sre_parse.py
file stats: 561 lines, 277 executed: 49.4% covered
1. # 2. # Secret Labs' Regular Expression Engine 3. # 4. # convert re-style regular expression to sre pattern 5. # 6. # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. 7. # 8. # See the sre.py file for information on usage and redistribution. 9. # 10. 11. """Internal support module for sre""" 12. 13. # XXX: show string offset and offending character for all errors 14. 15. # this module works under 1.5.2 and later. don't use string methods 16. import string, sys 17. 18. from sre_constants import * 19. 20. SPECIAL_CHARS = ".\\[{()*+?^$|" 21. REPEAT_CHARS = "*+?{" 22. 23. DIGITS = tuple("0123456789") 24. 25. OCTDIGITS = tuple("01234567") 26. HEXDIGITS = tuple("0123456789abcdefABCDEF") 27. 28. WHITESPACE = tuple(" \t\n\r\v\f") 29. 30. ESCAPES = { 31. r"\a": (LITERAL, ord("\a")), 32. r"\b": (LITERAL, ord("\b")), 33. r"\f": (LITERAL, ord("\f")), 34. r"\n": (LITERAL, ord("\n")), 35. r"\r": (LITERAL, ord("\r")), 36. r"\t": (LITERAL, ord("\t")), 37. r"\v": (LITERAL, ord("\v")), 38. r"\\": (LITERAL, ord("\\")) 39. } 40. 41. CATEGORIES = { 42. r"\A": (AT, AT_BEGINNING_STRING), # start of string 43. r"\b": (AT, AT_BOUNDARY), 44. r"\B": (AT, AT_NON_BOUNDARY), 45. r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), 46. r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), 47. r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), 48. r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), 49. r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), 50. r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), 51. r"\Z": (AT, AT_END_STRING), # end of string 52. } 53. 54. FLAGS = { 55. # standard flags 56. "i": SRE_FLAG_IGNORECASE, 57. "L": SRE_FLAG_LOCALE, 58. "m": SRE_FLAG_MULTILINE, 59. "s": SRE_FLAG_DOTALL, 60. "x": SRE_FLAG_VERBOSE, 61. # extensions 62. "t": SRE_FLAG_TEMPLATE, 63. "u": SRE_FLAG_UNICODE, 64. } 65. 66. # figure out best way to convert hex/octal numbers to integers 67. try: 68. int("10", 8) 69. atoi = int # 2.0 and later 70. except TypeError: 71. atoi = string.atoi # 1.5.2 72. 73. class Pattern: 74. # master pattern object. keeps track of global attributes 75. def __init__(self): 76. self.flags = 0 77. self.open = [] 78. self.groups = 1 79. self.groupdict = {} 80. def opengroup(self, name=None): 81. gid = self.groups 82. self.groups = gid + 1 83. if name is not None: 84. ogid = self.groupdict.get(name, None) 85. if ogid is not None: 86. raise error, ("redefinition of group name %s as group %d; " 87. "was group %d" % (repr(name), gid, ogid)) 88. self.groupdict[name] = gid 89. self.open.append(gid) 90. return gid 91. def closegroup(self, gid): 92. self.open.remove(gid) 93. def checkgroup(self, gid): 94. return gid < self.groups and gid not in self.open 95. 96. class SubPattern: 97. # a subpattern, in intermediate form 98. def __init__(self, pattern, data=None): 99. self.pattern = pattern 100. if data is None: 101. data = [] 102. self.data = data 103. self.width = None 104. def dump(self, level=0): 105. nl = 1 106. for op, av in self.data: 107. print level*" " + op,; nl = 0 108. if op == "in": 109. # member sublanguage 110. print; nl = 1 111. for op, a in av: 112. print (level+1)*" " + op, a 113. elif op == "branch": 114. print; nl = 1 115. i = 0 116. for a in av[1]: 117. if i > 0: 118. print level*" " + "or" 119. a.dump(level+1); nl = 1 120. i = i + 1 121. elif type(av) in (type(()), type([])): 122. for a in av: 123. if isinstance(a, SubPattern): 124. if not nl: print 125. a.dump(level+1); nl = 1 126. else: 127. print a, ; nl = 0 128. else: 129. print av, ; nl = 0 130. if not nl: print 131. def __repr__(self): 132. return repr(self.data) 133. def __len__(self): 134. return len(self.data) 135. def __delitem__(self, index): 136. del self.data[index] 137. def __getitem__(self, index): 138. return self.data[index] 139. def __setitem__(self, index, code): 140. self.data[index] = code 141. def __getslice__(self, start, stop): 142. return SubPattern(self.pattern, self.data[start:stop]) 143. def insert(self, index, code): 144. self.data.insert(index, code) 145. def append(self, code): 146. self.data.append(code) 147. def getwidth(self): 148. # determine the width (min, max) for this subpattern 149. if self.width: 150. return self.width 151. lo = hi = 0L 152. for op, av in self.data: 153. if op is BRANCH: 154. i = sys.maxint 155. j = 0 156. for av in av[1]: 157. l, h = av.getwidth() 158. i = min(i, l) 159. j = max(j, h) 160. lo = lo + i 161. hi = hi + j 162. elif op is CALL: 163. i, j = av.getwidth() 164. lo = lo + i 165. hi = hi + j 166. elif op is SUBPATTERN: 167. i, j = av[1].getwidth() 168. lo = lo + i 169. hi = hi + j 170. elif op in (MIN_REPEAT, MAX_REPEAT): 171. i, j = av[2].getwidth() 172. lo = lo + long(i) * av[0] 173. hi = hi + long(j) * av[1] 174. elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY): 175. lo = lo + 1 176. hi = hi + 1 177. elif op == SUCCESS: 178. break 179. self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) 180. return self.width 181. 182. class Tokenizer: 183. def __init__(self, string): 184. self.string = string 185. self.index = 0 186. self.__next() 187. def __next(self): 188. if self.index >= len(self.string): 189. self.next = None 190. return 191. char = self.string[self.index] 192. if char[0] == "\\": 193. try: 194. c = self.string[self.index + 1] 195. except IndexError: 196. raise error, "bogus escape (end of line)" 197. char = char + c 198. self.index = self.index + len(char) 199. self.next = char 200. def match(self, char, skip=1): 201. if char == self.next: 202. if skip: 203. self.__next() 204. return 1 205. return 0 206. def get(self): 207. this = self.next 208. self.__next() 209. return this 210. def tell(self): 211. return self.index, self.next 212. def seek(self, index): 213. self.index, self.next = index 214. 215. def isident(char): 216. return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" 217. 218. def isdigit(char): 219. return "0" <= char <= "9" 220. 221. def isname(name): 222. # check that group name is a valid string 223. if not isident(name[0]): 224. return False 225. for char in name: 226. if not isident(char) and not isdigit(char): 227. return False 228. return True 229. 230. def _group(escape, groups): 231. # check if the escape string represents a valid group 232. try: 233. gid = atoi(escape[1:]) 234. if gid and gid < groups: 235. return gid 236. except ValueError: 237. pass 238. return None # not a valid group 239. 240. def _class_escape(source, escape): 241. # handle escape code inside character class 242. code = ESCAPES.get(escape) 243. if code: 244. return code 245. code = CATEGORIES.get(escape) 246. if code: 247. return code 248. try: 249. if escape[1:2] == "x": 250. # hexadecimal escape (exactly two digits) 251. while source.next in HEXDIGITS and len(escape) < 4: 252. escape = escape + source.get() 253. escape = escape[2:] 254. if len(escape) != 2: 255. raise error, "bogus escape: %s" % repr("\\" + escape) 256. return LITERAL, atoi(escape, 16) & 0xff 257. elif escape[1:2] in OCTDIGITS: 258. # octal escape (up to three digits) 259. while source.next in OCTDIGITS and len(escape) < 5: 260. escape = escape + source.get() 261. escape = escape[1:] 262. return LITERAL, atoi(escape, 8) & 0xff 263. if len(escape) == 2: 264. return LITERAL, ord(escape[1]) 265. except ValueError: 266. pass 267. raise error, "bogus escape: %s" % repr(escape) 268. 269. def _escape(source, escape, state): 270. # handle escape code in expression 271. code = CATEGORIES.get(escape) 272. if code: 273. return code 274. code = ESCAPES.get(escape) 275. if code: 276. return code 277. try: 278. if escape[1:2] == "x": 279. # hexadecimal escape 280. while source.next in HEXDIGITS and len(escape) < 4: 281. escape = escape + source.get() 282. if len(escape) != 4: 283. raise ValueError 284. return LITERAL, atoi(escape[2:], 16) & 0xff 285. elif escape[1:2] == "0": 286. # octal escape 287. while source.next in OCTDIGITS and len(escape) < 4: 288. escape = escape + source.get() 289. return LITERAL, atoi(escape[1:], 8) & 0xff 290. elif escape[1:2] in DIGITS: 291. # octal escape *or* decimal group reference (sigh) 292. if source.next in DIGITS: 293. escape = escape + source.get() 294. if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and 295. source.next in OCTDIGITS): 296. # got three octal digits; this is an octal escape 297. escape = escape + source.get() 298. return LITERAL, atoi(escape[1:], 8) & 0xff 299. # got at least one decimal digit; this is a group reference 300. group = _group(escape, state.groups) 301. if group: 302. if not state.checkgroup(group): 303. raise error, "cannot refer to open group" 304. return GROUPREF, group 305. raise ValueError 306. if len(escape) == 2: 307. return LITERAL, ord(escape[1]) 308. except ValueError: 309. pass 310. raise error, "bogus escape: %s" % repr(escape) 311. 312. def _parse_sub(source, state, nested=1): 313. # parse an alternation: a|b|c 314. 315. items = [] 316. while 1: 317. items.append(_parse(source, state)) 318. if source.match("|"): 319. continue 320. if not nested: 321. break 322. if not source.next or source.match(")", 0): 323. break 324. else: 325. raise error, "pattern not properly closed" 326. 327. if len(items) == 1: 328. return items[0] 329. 330. subpattern = SubPattern(state) 331. 332. # check if all items share a common prefix 333. while 1: 334. prefix = None 335. for item in items: 336. if not item: 337. break 338. if prefix is None: 339. prefix = item[0] 340. elif item[0] != prefix: 341. break 342. else: 343. # all subitems start with a common "prefix". 344. # move it out of the branch 345. for item in items: 346. del item[0] 347. subpattern.append(prefix) 348. continue # check next one 349. break 350. 351. # check if the branch can be replaced by a character set 352. for item in items: 353. if len(item) != 1 or item[0][0] != LITERAL: 354. break 355. else: 356. # we can store this as a character set instead of a 357. # branch (the compiler may optimize this even more) 358. set = [] 359. for item in items: 360. set.append(item[0]) 361. subpattern.append((IN, set)) 362. return subpattern 363. 364. subpattern.append((BRANCH, (None, items))) 365. return subpattern 366. 367. def _parse(source, state): 368. # parse a simple pattern 369. 370. subpattern = SubPattern(state) 371. 372. while 1: 373. 374. if source.next in ("|", ")"): 375. break # end of subpattern 376. this = source.get() 377. if this is None: 378. break # end of pattern 379. 380. if state.flags & SRE_FLAG_VERBOSE: 381. # skip whitespace and comments 382. if this in WHITESPACE: 383. continue 384. if this == "#": 385. while 1: 386. this = source.get() 387. if this in (None, "\n"): 388. break 389. continue 390. 391. if this and this[0] not in SPECIAL_CHARS: 392. subpattern.append((LITERAL, ord(this))) 393. 394. elif this == "[": 395. # character set 396. set = [] 397. ## if source.match(":"): 398. ## pass # handle character classes 399. if source.match("^"): 400. set.append((NEGATE, None)) 401. # check remaining characters 402. start = set[:] 403. while 1: 404. this = source.get() 405. if this == "]" and set != start: 406. break 407. elif this and this[0] == "\\": 408. code1 = _class_escape(source, this) 409. elif this: 410. code1 = LITERAL, ord(this) 411. else: 412. raise error, "unexpected end of regular expression" 413. if source.match("-"): 414. # potential range 415. this = source.get() 416. if this == "]": 417. if code1[0] is IN: 418. code1 = code1[1][0] 419. set.append(code1) 420. set.append((LITERAL, ord("-"))) 421. break 422. elif this: 423. if this[0] == "\\": 424. code2 = _class_escape(source, this) 425. else: 426. code2 = LITERAL, ord(this) 427. if code1[0] != LITERAL or code2[0] != LITERAL: 428. raise error, "bad character range" 429. lo = code1[1] 430. hi = code2[1] 431. if hi < lo: 432. raise error, "bad character range" 433. set.append((RANGE, (lo, hi))) 434. else: 435. raise error, "unexpected end of regular expression" 436. else: 437. if code1[0] is IN: 438. code1 = code1[1][0] 439. set.append(code1) 440. 441. # XXX: <fl> should move set optimization to compiler! 442. if len(set)==1 and set[0][0] is LITERAL: 443. subpattern.append(set[0]) # optimization 444. elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: 445. subpattern.append((NOT_LITERAL, set[1][1])) # optimization 446. else: 447. # XXX: <fl> should add charmap optimization here 448. subpattern.append((IN, set)) 449. 450. elif this and this[0] in REPEAT_CHARS: 451. # repeat previous item 452. if this == "?": 453. min, max = 0, 1 454. elif this == "*": 455. min, max = 0, MAXREPEAT 456. 457. elif this == "+": 458. min, max = 1, MAXREPEAT 459. elif this == "{": 460. here = source.tell() 461. min, max = 0, MAXREPEAT 462. lo = hi = "" 463. while source.next in DIGITS: 464. lo = lo + source.get() 465. if source.match(","): 466. while source.next in DIGITS: 467. hi = hi + source.get() 468. else: 469. hi = lo 470. if not source.match("}"): 471. subpattern.append((LITERAL, ord(this))) 472. source.seek(here) 473. continue 474. if lo: 475. min = atoi(lo) 476. if hi: 477. max = atoi(hi) 478. if max < min: 479. raise error, "bad repeat interval" 480. else: 481. raise error, "not supported" 482. # figure out which item to repeat 483. if subpattern: 484. item = subpattern[-1:] 485. else: 486. item = None 487. if not item or (len(item) == 1 and item[0][0] == AT): 488. raise error, "nothing to repeat" 489. if item[0][0] in (MIN_REPEAT, MAX_REPEAT): 490. raise error, "multiple repeat" 491. if source.match("?"): 492. subpattern[-1] = (MIN_REPEAT, (min, max, item)) 493. else: 494. subpattern[-1] = (MAX_REPEAT, (min, max, item)) 495. 496. elif this == ".": 497. subpattern.append((ANY, None)) 498. 499. elif this == "(": 500. group = 1 501. name = None 502. if source.match("?"): 503. group = 0 504. # options 505. if source.match("P"): 506. # python extensions 507. if source.match("<"): 508. # named group: skip forward to end of name 509. name = "" 510. while 1: 511. char = source.get() 512. if char is None: 513. raise error, "unterminated name" 514. if char == ">": 515. break 516. name = name + char 517. group = 1 518. if not isname(name): 519. raise error, "bad character in group name" 520. elif source.match("="): 521. # named backreference 522. name = "" 523. while 1: 524. char = source.get() 525. if char is None: 526. raise error, "unterminated name" 527. if char == ")": 528. break 529. name = name + char 530. if not isname(name): 531. raise error, "bad character in group name" 532. gid = state.groupdict.get(name) 533. if gid is None: 534. raise error, "unknown group name" 535. subpattern.append((GROUPREF, gid)) 536. continue 537. else: 538. char = source.get() 539. if char is None: 540. raise error, "unexpected end of pattern" 541. raise error, "unknown specifier: ?P%s" % char 542. elif source.match(":"): 543. # non-capturing group 544. group = 2 545. elif source.match("#"): 546. # comment 547. while 1: 548. if source.next is None or source.next == ")": 549. break 550. source.get() 551. if not source.match(")"): 552. raise error, "unbalanced parenthesis" 553. continue 554. elif source.next in ("=", "!", "<"): 555. # lookahead assertions 556. char = source.get() 557. dir = 1 558. if char == "<": 559. if source.next not in ("=", "!"): 560. raise error, "syntax error" 561. dir = -1 # lookbehind 562. char = source.get() 563. p = _parse_sub(source, state) 564. if not source.match(")"): 565. raise error, "unbalanced parenthesis" 566. if char == "=": 567. subpattern.append((ASSERT, (dir, p))) 568. else: 569. subpattern.append((ASSERT_NOT, (dir, p))) 570. continue 571. else: 572. # flags 573. if not source.next in FLAGS: 574. raise error, "unexpected end of pattern" 575. while source.next in FLAGS: 576. state.flags = state.flags | FLAGS[source.get()] 577. if group: 578. # parse group contents 579. if group == 2: 580. # anonymous group 581. group = None 582. else: 583. group = state.opengroup(name) 584. p = _parse_sub(source, state) 585. if not source.match(")"): 586. raise error, "unbalanced parenthesis" 587. if group is not None: 588. state.closegroup(group) 589. subpattern.append((SUBPATTERN, (group, p))) 590. else: 591. while 1: 592. char = source.get() 593. if char is None: 594. raise error, "unexpected end of pattern" 595. if char == ")": 596. break 597. raise error, "unknown extension" 598. 599. elif this == "^": 600. subpattern.append((AT, AT_BEGINNING)) 601. 602. elif this == "$": 603. subpattern.append((AT, AT_END)) 604. 605. elif this and this[0] == "\\": 606. code = _escape(source, this, state) 607. subpattern.append(code) 608. 609. else: 610. raise error, "parser error" 611. 612. return subpattern 613. 614. def parse(str, flags=0, pattern=None): 615. # parse 're' pattern into list of (opcode, argument) tuples 616. 617. source = Tokenizer(str) 618. 619. if pattern is None: 620. pattern = Pattern() 621. pattern.flags = flags 622. pattern.str = str 623. 624. p = _parse_sub(source, pattern, 0) 625. 626. tail = source.get() 627. if tail == ")": 628. raise error, "unbalanced parenthesis" 629. elif tail: 630. raise error, "bogus characters at end of regular expression" 631. 632. if flags & SRE_FLAG_DEBUG: 633. p.dump() 634. 635. if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: 636. # the VERBOSE flag was switched on inside the pattern. to be 637. # on the safe side, we'll parse the whole thing again... 638. return parse(str, p.pattern.flags) 639. 640. return p 641. 642. def parse_template(source, pattern): 643. # parse 're' replacement string into list of literals and 644. # group references 645. s = Tokenizer(source) 646. p = [] 647. a = p.append 648. def literal(literal, p=p): 649. if p and p[-1][0] is LITERAL: 650. p[-1] = LITERAL, p[-1][1] + literal 651. else: 652. p.append((LITERAL, literal)) 653. sep = source[:0] 654. if type(sep) is type(""): 655. makechar = chr 656. else: 657. makechar = unichr 658. while 1: 659. this = s.get() 660. if this is None: 661. break # end of replacement string 662. if this and this[0] == "\\": 663. # group 664. if this == "\\g": 665. name = "" 666. if s.match("<"): 667. while 1: 668. char = s.get() 669. if char is None: 670. raise error, "unterminated group name" 671. if char == ">": 672. break 673. name = name + char 674. if not name: 675. raise error, "bad group name" 676. try: 677. index = atoi(name) 678. except ValueError: 679. if not isname(name): 680. raise error, "bad character in group name" 681. try: 682. index = pattern.groupindex[name] 683. except KeyError: 684. raise IndexError, "unknown group name" 685. a((MARK, index)) 686. elif len(this) > 1 and this[1] in DIGITS: 687. code = None 688. while 1: 689. group = _group(this, pattern.groups+1) 690. if group: 691. if (s.next not in DIGITS or 692. not _group(this + s.next, pattern.groups+1)): 693. code = MARK, group 694. break 695. elif s.next in OCTDIGITS: 696. this = this + s.get() 697. else: 698. break 699. if not code: 700. this = this[1:] 701. code = LITERAL, makechar(atoi(this[-6:], 8) & 0xff) 702. if code[0] is LITERAL: 703. literal(code[1]) 704. else: 705. a(code) 706. else: 707. try: 708. this = makechar(ESCAPES[this][1]) 709. except KeyError: 710. pass 711. literal(this) 712. else: 713. literal(this) 714. # convert template to groups and literals lists 715. i = 0 716. groups = [] 717. literals = [] 718. for c, s in p: 719. if c is MARK: 720. groups.append((i, s)) 721. literals.append(None) 722. else: 723. literals.append(s) 724. i = i + 1 725. return groups, literals 726. 727. def expand_template(template, match): 728. g = match.group 729. sep = match.string[:0] 730. groups, literals = template 731. literals = literals[:] 732. try: 733. for index, group in groups: 734. literals[index] = s = g(group) 735. if s is None: 736. raise IndexError 737. except IndexError: 738. raise error, "empty group" 739. return string.join(literals, sep)