source file: /System/Library/Frameworks/Python.framework/Versions/2.3/lib/python2.3/urlparse.py
file stats: 163 lines, 42 executed: 25.8% covered
"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit"]

# A classification of schemes ('' means apply by default).
# Each list names the schemes for which a given URL feature is
# meaningful; parsing functions below consult these to decide whether
# to split off a netloc, ;params, ?query or #fragment component.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip',
               'mms', '']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names (RFC 1808: alpha, digit, "+", "-", ".")
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() memoizes its results in _parse_cache; the cache is wiped
# wholesale once it reaches MAX_CACHE_SIZE entries (no LRU eviction).
MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache."""
    global _parse_cache
    _parse_cache = {}


def urlparse(url, scheme='', allow_fragments=1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    # Delegate to urlsplit() for the 5-way split, then peel the
    # ;params off the path only for schemes where params are defined.
    tuple = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = tuple
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return scheme, netloc, url, params, query, fragment

def _splitparams(url):
    # Split ";params" off a path.  Params belong to the *last* path
    # segment, so when the path contains '/', only a ';' at or after
    # the final '/' counts; otherwise the first ';' anywhere does.
    # Returns (path-without-params, params).
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            # ';' occurs only in an earlier segment: no params to split.
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def urlsplit(url, scheme='', allow_fragments=1):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    # Results are memoized on the full argument triple.
    # NOTE(review): the key does not include the string type, so str and
    # unicode inputs that compare equal share one cached result — later
    # Python versions added type(url) to the key; confirm acceptable here.
    # NOTE(review): 'if cached:' tests truthiness rather than 'is not None';
    # safe in practice because the cached value is a non-empty 5-tuple.
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            # Fast path: handle "http" inline (netloc/fragment/query
            # splitting) and return without consulting the scheme lists.
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                # netloc runs up to the next '/', or failing that the
                # '#' starting a fragment, or the end of the string.
                i = url.find('/', 2)
                if i < 0:
                    i = url.find('#')
                    if i < 0:
                        i = len(url)
                netloc = url[2:i]
                url = url[i:]
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            tuple = scheme, netloc, url, query, fragment
            _parse_cache[key] = tuple
            return tuple
        # General case: accept "scheme:" only if every character before
        # the ':' is a valid scheme character (for/else: the else runs
        # when the loop completes without break).
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    # Component splitting below is gated on the scheme classification
    # lists, unlike the http fast path above which splits unconditionally.
    if scheme in uses_netloc:
        if url[:2] == '//':
            i = url.find('/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    tuple = scheme, netloc, url, query, fragment
    _parse_cache[key] = tuple
    return tuple

def urlunparse((scheme, netloc, url, params, query, fragment)):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    # Re-attach ;params to the path, then let urlunsplit() do the rest.
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))

def urlunsplit((scheme, netloc, url, query, fragment)):
    # Inverse of urlsplit(): reassemble a 5-tuple into a URL string.
    # The '//' netloc marker is emitted when there is a netloc, or when
    # the scheme expects one and the path doesn't already start with '//'.
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url

def urljoin(base, url, allow_fragments = 1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    # Trivial cases: an empty base or url leaves the other unchanged.
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that doesn't support relative URLs,
    # means url stands on its own.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        # An explicit netloc in url makes it absolute; otherwise inherit
        # the base's netloc.
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only the netloc is inherited from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: keep the base path and fill in missing params/query
        # from the base (query is inherited only when params also were).
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Relative path: splice it onto the base path's directory part, then
    # resolve '.' and '..' segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse the first "segment/.." pair that isn't at the
    # root or itself a '..'; restart the scan after each deletion.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # A trailing '..' leaves a trailing '/' rather than a '..' segment.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))

def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''


# Self-test data: first non-blank word of the first line seeds the base
# URL; subsequent lines are "relative = <URL:expected-absolute>" pairs
# checked by test() below.  (Cases drawn from RFC 1808's examples.)
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""

def test():
    # Run the urljoin self-test.  Input comes from the file named in
    # sys.argv[1] ('-' for stdin), or from test_input above by default.
    # The first URL seen becomes the base; each later line's first word
    # is joined against it and compared with the expected "<URL:...>".
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        import StringIO
        fp = StringIO.StringIO(test_input)
    while 1:
        line = fp.readline()
        if not line: break
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print '%-10s : %s' % (url, parts)
        abs = urljoin(base, url)
        if not base:
            base = abs
        wrapped = '<URL:%s>' % abs
        print '%-10s = %s' % (url, wrapped)
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print 'EXPECTED', words[2], '!!!!!!!!!!'

if __name__ == '__main__':
    test()