source file: /Library/Python/2.3/site-packages/CherryPy-3.0.1-py2.3.egg/cherrypy/lib/tidy.py
file stats: 122 lines, 11 executed: 9.0% covered
"""Functions to run cherrypy.response through Tidy or NSGMLS."""

import cgi
import os
import StringIO
import traceback

import cherrypy


def _write_file(path, data):
    """Write data to path in binary mode, closing the file afterwards."""
    f = open(path, 'wb')
    try:
        f.write(data)
    finally:
        f.close()


def _read_file(path):
    """Return the full content of path, read in binary mode."""
    f = open(path, 'rb')
    try:
        return f.read()
    finally:
        f.close()


def _matches_any(text, fragments):
    """Return True if any of the given substrings occurs in text."""
    for fragment in fragments:
        if fragment in text:
            return True
    return False


def _discard_content_length(response):
    """Delete the Content-Length header so finalize() recalcs it."""
    if "Content-Length" in response.headers:
        del response.headers["Content-Length"]


def tidy(temp_dir, tidy_path, strict_xml=False, errors_to_ignore=None,
         indent=False, wrap=False, warnings=True):
    """Run cherrypy.response through Tidy.

    If either 'indent' or 'wrap' are specified (and tidy exits with a
    zero status), then response.body will be set to the output of tidy.
    Otherwise, only errors (including warnings, if warnings is True)
    will change the body.

    Only responses whose Content-Type is 'text/html' are processed;
    anything else is left alone apart from the body being collapsed.

    temp_dir: directory where 'page.html', 'tidy.out' and 'tidy.err' are
        written. The file names are fixed, so concurrent calls that share
        one temp_dir will overwrite each other's files.
    tidy_path: path of the tidy executable.
    strict_xml: if true, pass -xml to tidy and, when tidy reports nothing,
        additionally parse the page with ElementTree.
    errors_to_ignore: optional list of substrings; a tidy message that
        contains any of them is not reported.
    indent: if true, pass -indent to tidy.
    wrap: False to leave wrapping alone, otherwise the column number
        passed to tidy's -wrap option.
    warnings: if true, report lines containing 'Warning' as well as
        lines containing 'Error'.

    Note that we use the standalone Tidy tool rather than the python
    mxTidy module. This is because this module does not seem to be
    stable and it crashes on some HTML pages (which means that the
    server would also crash)
    """
    response = cherrypy.response

    # The tidy tool, by its very nature, is not generator friendly,
    # so we just collapse the body and work with it.
    orig_body = response.collapse_body()

    fct = response.headers.get('Content-Type', '')
    ct = fct.split(';')[0]
    encoding = ''
    i = fct.find('charset=')
    if i != -1:
        # Stop at the next parameter and drop optional quotes so that
        # only the charset name itself is used.
        encoding = fct[i + 8:].split(';')[0].strip().strip('"')

    if ct != 'text/html':
        return

    page_file = os.path.join(temp_dir, 'page.html')
    _write_file(page_file, orig_body)

    out_file = os.path.join(temp_dir, 'tidy.out')
    err_file = os.path.join(temp_dir, 'tidy.err')

    # Tidy takes the encoding as a flag without dashes, e.g. "-utf8".
    tidy_enc = encoding.replace('-', '')
    if tidy_enc:
        tidy_enc = '-' + tidy_enc

    xml_flag = ''
    if strict_xml:
        xml_flag = ' -xml'

    indent_flag = ''
    if indent:
        indent_flag = ' -indent'

    wrap_flag = ''
    if wrap is not False:
        try:
            wrap_flag = ' -wrap %d' % int(wrap)
        except (TypeError, ValueError):
            # Not usable as a column number: run tidy without -wrap.
            wrap_flag = ''

    result = os.system('"%s" %s%s%s%s -f %s -o %s %s' %
                       (tidy_path, tidy_enc, xml_flag, indent_flag,
                        wrap_flag, err_file, out_file, page_file))
    use_output = bool(indent_flag or wrap_flag) and not result
    if use_output:
        output = _read_file(out_file)

    ignorable = errors_to_ignore or []
    new_errs = []
    for err in _read_file(err_file).splitlines():
        reportable = ('Error' in err or
                      (warnings and 'Warning' in err))
        if reportable and not _matches_any(err, ignorable):
            new_errs.append(err)

    if new_errs:
        # wrong_content() escapes the header and turns newlines into
        # <br />, so join with newlines rather than with markup.
        response.body = wrong_content('\n'.join(new_errs), orig_body)
        _discard_content_length(response)
        return
    elif strict_xml:
        # The HTML is OK, but is it valid XML?
        # Use elementtree to parse XML
        try:
            from xml.etree.ElementTree import parse
        except ImportError:
            from elementtree.ElementTree import parse

        # Replace these entity references with plain text before parsing.
        for tag in ['nbsp', 'quot']:
            orig_body = orig_body.replace('&' + tag + ';', tag.upper())

        if encoding:
            enctag = '<?xml version="1.0" encoding="%s"?>' % encoding
            orig_body = enctag + orig_body

        try:
            parse(StringIO.StringIO(orig_body))
        except Exception:
            # Wrong XML
            tb_file = StringIO.StringIO()
            traceback.print_exc(file=tb_file)
            # Pass the traceback as plain text; wrong_content() escapes
            # it and converts its newlines to <br />.
            response.body = wrong_content(tb_file.getvalue(),
                                          orig_body, "XML")
            _discard_content_length(response)
            return

    if use_output:
        response.body = [output]
        _discard_content_length(response)


def html_space(text):
    """Escape text, replacing space with nbsp and tab with 4 nbsp's."""
    # Expand tabs to four spaces first so that the space -> &nbsp;
    # replacement covers them as well.
    return cgi.escape(text).replace('\t', '    ').replace(' ', '&nbsp;')


def html_break(text):
    """Escape text, replacing newline with HTML br element."""
    return cgi.escape(text).replace('\n', '<br />')

def wrong_content(header, body, content_type="HTML"):
    """Return an HTML report: a header followed by the numbered body lines.

    header: plain text describing the problem; it is passed through
        html_break().
    body: the offending page source; each line is passed through
        html_space() and prefixed with its 1-based, zero-padded number.
    content_type: label used in the "Wrong ...:" heading.
    """
    output = ["Wrong %s:<br />%s<br />" % (content_type, html_break(header))]
    for i, line in enumerate(body.splitlines()):
        output.append("%03d - %s" % (i + 1, html_space(line)))
    return "<br />".join(output)


def nsgmls(temp_dir, nsgmls_path, catalog_path, errors_to_ignore=None):
    """Run cherrypy.response through nsgmls.

    Only responses whose Content-Type is 'text/html' are processed. If
    nsgmls writes any message that is not ignored, response.body is
    replaced by a report built with wrong_content().

    temp_dir: directory where 'page.html' and 'nsgmls.err' are written.
        The file names are fixed, so concurrent calls that share one
        temp_dir will overwrite each other's files.
    nsgmls_path: path of the nsgmls executable.
    catalog_path: value passed to nsgmls with -c.
    errors_to_ignore: optional list of substrings; a message that
        contains any of them is not reported.
    """
    response = cherrypy.response

    # nsgmls works on a file, so it is not generator friendly;
    # we just collapse the body and work with it.
    orig_body = response.collapse_body()

    fct = response.headers.get('Content-Type', '')
    ct = fct.split(';')[0]
    if ct != 'text/html':
        return

    # Remove bits of Javascript (nsgmls doesn't seem to handle
    # them correctly (for instance, if <a appears in your
    # Javascript code nsgmls complains about it)
    while True:
        i = orig_body.find('<script')
        if i == -1:
            break
        j = orig_body.find('</script>', i)
        if j == -1:
            break
        # 9 == len('</script>')
        orig_body = orig_body[:i] + orig_body[j + 9:]

    page_file = os.path.join(temp_dir, 'page.html')
    f = open(page_file, 'wb')
    try:
        f.write(orig_body)
    finally:
        f.close()

    err_file = os.path.join(temp_dir, 'nsgmls.err')
    command = ('%s -c%s -f%s -s -E10 %s' %
               (nsgmls_path, catalog_path, err_file, page_file))
    command = command.replace('\\', '/')
    os.system(command)

    f = open(err_file, 'rb')
    try:
        errs = f.read()
    finally:
        f.close()

    ignorable = errors_to_ignore or []
    new_errs = []
    for err in errs.splitlines():
        ignore = False
        for err_ign in ignorable:
            if err_ign in err:
                ignore = True
                break
        if not ignore:
            new_errs.append(err)

    if new_errs:
        # wrong_content() escapes the header and turns newlines into
        # <br />, so join with newlines rather than with markup.
        response.body = wrong_content('\n'.join(new_errs), orig_body)
        if "Content-Length" in response.headers:
            # Delete Content-Length header so finalize() recalcs it.
            del response.headers["Content-Length"]