source file: /System/Library/Frameworks/Python.framework/Versions/2.3/lib/python2.3/email/_parseaddr.py
file stats: 302 lines, 37 executed: 12.3% covered
   1. # Copyright (C) 2002 Python Software Foundation
   2. 
   3. """Email address parsing code.
   4. 
   5. Lifted directly from rfc822.py.  This should eventually be rewritten.
   6. """
   7. 
   8. import time
   9. from types import TupleType
  10. 
# Compatibility shim: on interpreters where the names True and False are not
# built in, merely referencing them raises NameError, in which case they are
# defined here as the integers 1 and 0.
try:
    True, False
except NameError:
    True = 1
    False = 0

# Named string constants: a single space, the empty string, and a comma
# followed by a space.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '
  20. 
  21. # Parse a date field
  22. _monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
  23.                'aug', 'sep', 'oct', 'nov', 'dec',
  24.                'january', 'february', 'march', 'april', 'may', 'june', 'july',
  25.                'august', 'september', 'october', 'november', 'december']
  26. 
  27. _daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
  28. 
  29. # The timezone table does not include the military time zones defined
  30. # in RFC822, other than Z.  According to RFC1123, the description in
  31. # RFC822 gets the signs wrong, so we can't rely on any such time
  32. # zones.  RFC1123 recommends that numeric timezone indicators be used
  33. # instead of timezone names.
  34. 
  35. _timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
  36.               'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
  37.               'EST': -500, 'EDT': -400,  # Eastern
  38.               'CST': -600, 'CDT': -500,  # Central
  39.               'MST': -700, 'MDT': -600,  # Mountain
  40.               'PST': -800, 'PDT': -700   # Pacific
  41.               }
  42. 
  43. 
  44. def parsedate_tz(data):
  45.     """Convert a date string to a time tuple.
  46. 
  47.     Accounts for military timezones.
  48.     """
  49.     data = data.split()
  50.     # The FWS after the comma after the day-of-week is optional, so search and
  51.     # adjust for this.
  52.     if data[0].endswith(',') or data[0].lower() in _daynames:
  53.         # There's a dayname here. Skip it
  54.         del data[0]
  55.     else:
  56.         i = data[0].rfind(',')
  57.         if i >= 0:
  58.             data[0] = data[0][i+1:]
  59.     if len(data) == 3: # RFC 850 date, deprecated
  60.         stuff = data[0].split('-')
  61.         if len(stuff) == 3:
  62.             data = stuff + data[1:]
  63.     if len(data) == 4:
  64.         s = data[3]
  65.         i = s.find('+')
  66.         if i > 0:
  67.             data[3:] = [s[:i], s[i+1:]]
  68.         else:
  69.             data.append('') # Dummy tz
  70.     if len(data) < 5:
  71.         return None
  72.     data = data[:5]
  73.     [dd, mm, yy, tm, tz] = data
  74.     mm = mm.lower()
  75.     if mm not in _monthnames:
  76.         dd, mm = mm, dd.lower()
  77.         if mm not in _monthnames:
  78.             return None
  79.     mm = _monthnames.index(mm) + 1
  80.     if mm > 12:
  81.         mm -= 12
  82.     if dd[-1] == ',':
  83.         dd = dd[:-1]
  84.     i = yy.find(':')
  85.     if i > 0:
  86.         yy, tm = tm, yy
  87.     if yy[-1] == ',':
  88.         yy = yy[:-1]
  89.     if not yy[0].isdigit():
  90.         yy, tz = tz, yy
  91.     if tm[-1] == ',':
  92.         tm = tm[:-1]
  93.     tm = tm.split(':')
  94.     if len(tm) == 2:
  95.         [thh, tmm] = tm
  96.         tss = '0'
  97.     elif len(tm) == 3:
  98.         [thh, tmm, tss] = tm
  99.     else:
 100.         return None
 101.     try:
 102.         yy = int(yy)
 103.         dd = int(dd)
 104.         thh = int(thh)
 105.         tmm = int(tmm)
 106.         tss = int(tss)
 107.     except ValueError:
 108.         return None
 109.     tzoffset = None
 110.     tz = tz.upper()
 111.     if _timezones.has_key(tz):
 112.         tzoffset = _timezones[tz]
 113.     else:
 114.         try:
 115.             tzoffset = int(tz)
 116.         except ValueError:
 117.             pass
 118.     # Convert a timezone offset into seconds ; -0500 -> -18000
 119.     if tzoffset:
 120.         if tzoffset < 0:
 121.             tzsign = -1
 122.             tzoffset = -tzoffset
 123.         else:
 124.             tzsign = 1
 125.         tzoffset = tzsign * ( (tzoffset/100)*3600 + (tzoffset % 100)*60)
 126.     tuple = (yy, mm, dd, thh, tmm, tss, 0, 0, 0, tzoffset)
 127.     return tuple
 128. 
 129. 
 130. def parsedate(data):
 131.     """Convert a time string to a time tuple."""
 132.     t = parsedate_tz(data)
 133.     if isinstance(t, TupleType):
 134.         return t[:9]
 135.     else:
 136.         return t
 137. 
 138. 
 139. def mktime_tz(data):
 140.     """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
 141.     if data[9] is None:
 142.         # No zone info, so localtime is better assumption than GMT
 143.         return time.mktime(data[:8] + (-1,))
 144.     else:
 145.         t = time.mktime(data[:8] + (0,))
 146.         return t - data[9] - time.timezone
 147. 
 148. 
 149. def quote(str):
 150.     """Add quotes around a string."""
 151.     return str.replace('\\', '\\\\').replace('"', '\\"')
 152. 
 153. 
 154. class AddrlistClass:
 155.     """Address parser class by Ben Escoto.
 156. 
 157.     To understand what this class does, it helps to have a copy of RFC 2822 in
 158.     front of you.
 159. 
 160.     Note: this class interface is deprecated and may be removed in the future.
 161.     Use rfc822.AddressList instead.
 162.     """
 163. 
 164.     def __init__(self, field):
 165.         """Initialize a new instance.
 166. 
 167.         `field' is an unparsed address header field, containing
 168.         one or more addresses.
 169.         """
 170.         self.specials = '()<>@,:;.\"[]'
 171.         self.pos = 0
 172.         self.LWS = ' \t'
 173.         self.CR = '\r\n'
 174.         self.atomends = self.specials + self.LWS + self.CR
 175.         # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
 176.         # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
 177.         # syntax, so allow dots in phrases.
 178.         self.phraseends = self.atomends.replace('.', '')
 179.         self.field = field
 180.         self.commentlist = []
 181. 
 182.     def gotonext(self):
 183.         """Parse up to the start of the next address."""
 184.         while self.pos < len(self.field):
 185.             if self.field[self.pos] in self.LWS + '\n\r':
 186.                 self.pos += 1
 187.             elif self.field[self.pos] == '(':
 188.                 self.commentlist.append(self.getcomment())
 189.             else:
 190.                 break
 191. 
 192.     def getaddrlist(self):
 193.         """Parse all addresses.
 194. 
 195.         Returns a list containing all of the addresses.
 196.         """
 197.         result = []
 198.         while self.pos < len(self.field):
 199.             ad = self.getaddress()
 200.             if ad:
 201.                 result += ad
 202.             else:
 203.                 result.append(('', ''))
 204.         return result
 205. 
 206.     def getaddress(self):
 207.         """Parse the next address."""
 208.         self.commentlist = []
 209.         self.gotonext()
 210. 
 211.         oldpos = self.pos
 212.         oldcl = self.commentlist
 213.         plist = self.getphraselist()
 214. 
 215.         self.gotonext()
 216.         returnlist = []
 217. 
 218.         if self.pos >= len(self.field):
 219.             # Bad email address technically, no domain.
 220.             if plist:
 221.                 returnlist = [(SPACE.join(self.commentlist), plist[0])]
 222. 
 223.         elif self.field[self.pos] in '.@':
 224.             # email address is just an addrspec
 225.             # this isn't very efficient since we start over
 226.             self.pos = oldpos
 227.             self.commentlist = oldcl
 228.             addrspec = self.getaddrspec()
 229.             returnlist = [(SPACE.join(self.commentlist), addrspec)]
 230. 
 231.         elif self.field[self.pos] == ':':
 232.             # address is a group
 233.             returnlist = []
 234. 
 235.             fieldlen = len(self.field)
 236.             self.pos += 1
 237.             while self.pos < len(self.field):
 238.                 self.gotonext()
 239.                 if self.pos < fieldlen and self.field[self.pos] == ';':
 240.                     self.pos += 1
 241.                     break
 242.                 returnlist = returnlist + self.getaddress()
 243. 
 244.         elif self.field[self.pos] == '<':
 245.             # Address is a phrase then a route addr
 246.             routeaddr = self.getrouteaddr()
 247. 
 248.             if self.commentlist:
 249.                 returnlist = [(SPACE.join(plist) + ' (' +
 250.                                ' '.join(self.commentlist) + ')', routeaddr)]
 251.             else:
 252.                 returnlist = [(SPACE.join(plist), routeaddr)]
 253. 
 254.         else:
 255.             if plist:
 256.                 returnlist = [(SPACE.join(self.commentlist), plist[0])]
 257.             elif self.field[self.pos] in self.specials:
 258.                 self.pos += 1
 259. 
 260.         self.gotonext()
 261.         if self.pos < len(self.field) and self.field[self.pos] == ',':
 262.             self.pos += 1
 263.         return returnlist
 264. 
 265.     def getrouteaddr(self):
 266.         """Parse a route address (Return-path value).
 267. 
 268.         This method just skips all the route stuff and returns the addrspec.
 269.         """
 270.         if self.field[self.pos] != '<':
 271.             return
 272. 
 273.         expectroute = False
 274.         self.pos += 1
 275.         self.gotonext()
 276.         adlist = ''
 277.         while self.pos < len(self.field):
 278.             if expectroute:
 279.                 self.getdomain()
 280.                 expectroute = False
 281.             elif self.field[self.pos] == '>':
 282.                 self.pos += 1
 283.                 break
 284.             elif self.field[self.pos] == '@':
 285.                 self.pos += 1
 286.                 expectroute = True
 287.             elif self.field[self.pos] == ':':
 288.                 self.pos += 1
 289.             else:
 290.                 adlist = self.getaddrspec()
 291.                 self.pos += 1
 292.                 break
 293.             self.gotonext()
 294. 
 295.         return adlist
 296. 
 297.     def getaddrspec(self):
 298.         """Parse an RFC 2822 addr-spec."""
 299.         aslist = []
 300. 
 301.         self.gotonext()
 302.         while self.pos < len(self.field):
 303.             if self.field[self.pos] == '.':
 304.                 aslist.append('.')
 305.                 self.pos += 1
 306.             elif self.field[self.pos] == '"':
 307.                 aslist.append('"%s"' % self.getquote())
 308.             elif self.field[self.pos] in self.atomends:
 309.                 break
 310.             else:
 311.                 aslist.append(self.getatom())
 312.             self.gotonext()
 313. 
 314.         if self.pos >= len(self.field) or self.field[self.pos] != '@':
 315.             return EMPTYSTRING.join(aslist)
 316. 
 317.         aslist.append('@')
 318.         self.pos += 1
 319.         self.gotonext()
 320.         return EMPTYSTRING.join(aslist) + self.getdomain()
 321. 
 322.     def getdomain(self):
 323.         """Get the complete domain name from an address."""
 324.         sdlist = []
 325.         while self.pos < len(self.field):
 326.             if self.field[self.pos] in self.LWS:
 327.                 self.pos += 1
 328.             elif self.field[self.pos] == '(':
 329.                 self.commentlist.append(self.getcomment())
 330.             elif self.field[self.pos] == '[':
 331.                 sdlist.append(self.getdomainliteral())
 332.             elif self.field[self.pos] == '.':
 333.                 self.pos += 1
 334.                 sdlist.append('.')
 335.             elif self.field[self.pos] in self.atomends:
 336.                 break
 337.             else:
 338.                 sdlist.append(self.getatom())
 339.         return EMPTYSTRING.join(sdlist)
 340. 
 341.     def getdelimited(self, beginchar, endchars, allowcomments=True):
 342.         """Parse a header fragment delimited by special characters.
 343. 
 344.         `beginchar' is the start character for the fragment.
 345.         If self is not looking at an instance of `beginchar' then
 346.         getdelimited returns the empty string.
 347. 
 348.         `endchars' is a sequence of allowable end-delimiting characters.
 349.         Parsing stops when one of these is encountered.
 350. 
 351.         If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
 352.         within the parsed fragment.
 353.         """
 354.         if self.field[self.pos] != beginchar:
 355.             return ''
 356. 
 357.         slist = ['']
 358.         quote = False
 359.         self.pos += 1
 360.         while self.pos < len(self.field):
 361.             if quote:
 362.                 slist.append(self.field[self.pos])
 363.                 quote = False
 364.             elif self.field[self.pos] in endchars:
 365.                 self.pos += 1
 366.                 break
 367.             elif allowcomments and self.field[self.pos] == '(':
 368.                 slist.append(self.getcomment())
 369.             elif self.field[self.pos] == '\\':
 370.                 quote = True
 371.             else:
 372.                 slist.append(self.field[self.pos])
 373.             self.pos += 1
 374. 
 375.         return EMPTYSTRING.join(slist)
 376. 
 377.     def getquote(self):
 378.         """Get a quote-delimited fragment from self's field."""
 379.         return self.getdelimited('"', '"\r', False)
 380. 
 381.     def getcomment(self):
 382.         """Get a parenthesis-delimited fragment from self's field."""
 383.         return self.getdelimited('(', ')\r', True)
 384. 
 385.     def getdomainliteral(self):
 386.         """Parse an RFC 2822 domain-literal."""
 387.         return '[%s]' % self.getdelimited('[', ']\r', False)
 388. 
 389.     def getatom(self, atomends=None):
 390.         """Parse an RFC 2822 atom.
 391. 
 392.         Optional atomends specifies a different set of end token delimiters
 393.         (the default is to use self.atomends).  This is used e.g. in
 394.         getphraselist() since phrase endings must not include the `.' (which
 395.         is legal in phrases)."""
 396.         atomlist = ['']
 397.         if atomends is None:
 398.             atomends = self.atomends
 399. 
 400.         while self.pos < len(self.field):
 401.             if self.field[self.pos] in atomends:
 402.                 break
 403.             else:
 404.                 atomlist.append(self.field[self.pos])
 405.             self.pos += 1
 406. 
 407.         return EMPTYSTRING.join(atomlist)
 408. 
 409.     def getphraselist(self):
 410.         """Parse a sequence of RFC 2822 phrases.
 411. 
 412.         A phrase is a sequence of words, which are in turn either RFC 2822
 413.         atoms or quoted-strings.  Phrases are canonicalized by squeezing all
 414.         runs of continuous whitespace into one space.
 415.         """
 416.         plist = []
 417. 
 418.         while self.pos < len(self.field):
 419.             if self.field[self.pos] in self.LWS:
 420.                 self.pos += 1
 421.             elif self.field[self.pos] == '"':
 422.                 plist.append(self.getquote())
 423.             elif self.field[self.pos] == '(':
 424.                 self.commentlist.append(self.getcomment())
 425.             elif self.field[self.pos] in self.phraseends:
 426.                 break
 427.             else:
 428.                 plist.append(self.getatom(self.phraseends))
 429. 
 430.         return plist
 431. 
 432. class AddressList(AddrlistClass):
 433.     """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
 434.     def __init__(self, field):
 435.         AddrlistClass.__init__(self, field)
 436.         if field:
 437.             self.addresslist = self.getaddrlist()
 438.         else:
 439.             self.addresslist = []
 440. 
 441.     def __len__(self):
 442.         return len(self.addresslist)
 443. 
 444.     def __add__(self, other):
 445.         # Set union
 446.         newaddr = AddressList(None)
 447.         newaddr.addresslist = self.addresslist[:]
 448.         for x in other.addresslist:
 449.             if not x in self.addresslist:
 450.                 newaddr.addresslist.append(x)
 451.         return newaddr
 452. 
 453.     def __iadd__(self, other):
 454.         # Set union, in-place
 455.         for x in other.addresslist:
 456.             if not x in self.addresslist:
 457.                 self.addresslist.append(x)
 458.         return self
 459. 
 460.     def __sub__(self, other):
 461.         # Set difference
 462.         newaddr = AddressList(None)
 463.         for x in self.addresslist:
 464.             if not x in other.addresslist:
 465.                 newaddr.addresslist.append(x)
 466.         return newaddr
 467. 
 468.     def __isub__(self, other):
 469.         # Set difference, in-place
 470.         for x in other.addresslist:
 471.             if x in self.addresslist:
 472.                 self.addresslist.remove(x)
 473.         return self
 474. 
 475.     def __getitem__(self, index):
 476.         # Make indexing, slices, and 'in' work
 477.         return self.addresslist[index]