source file: /Library/Python/2.3/site-packages/CherryPy-3.0.1-py2.3.egg/cherrypy/lib/tidy.py
file stats: 122 lines, 11 executed: 9.0% covered
   1. """Functions to run cherrypy.response through Tidy or NSGML."""
   2. 
   3. import cgi
   4. import os
   5. import StringIO
   6. import traceback
   7. 
   8. import cherrypy
   9. 
  10. def tidy(temp_dir, tidy_path, strict_xml=False, errors_to_ignore=None,
  11.          indent=False, wrap=False, warnings=True):
  12.     """Run cherrypy.response through Tidy.
  13. 
  14.     If either 'indent' or 'wrap' are specified, then response.body will be
  15.     set to the output of tidy. Otherwise, only errors (including warnings,
  16.     if warnings is True) will change the body.
  17. 
  18.     Note that we use the standalone Tidy tool rather than the python
  19.     mxTidy module. This is because this module does not seem to be
  20.     stable and it crashes on some HTML pages (which means that the
  21.     server would also crash)
  22.     """
  23.     response = cherrypy.response
  24. 
  25.     # the tidy tool, by its very nature it's not generator friendly,
  26.     # so we just collapse the body and work with it.
  27.     orig_body = response.collapse_body()
  28. 
  29.     fct = response.headers.get('Content-Type', '')
  30.     ct = fct.split(';')[0]
  31.     encoding = ''
  32.     i = fct.find('charset=')
  33.     if i != -1:
  34.         encoding = fct[i + 8:]
  35. 
  36.     if ct == 'text/html':
  37.         page_file = os.path.join(temp_dir, 'page.html')
  38.         open(page_file, 'wb').write(orig_body)
  39. 
  40.         out_file = os.path.join(temp_dir, 'tidy.out')
  41.         err_file = os.path.join(temp_dir, 'tidy.err')
  42.         tidy_enc = encoding.replace('-', '')
  43.         if tidy_enc:
  44.             tidy_enc = '-' + tidy_enc
  45. 
  46.         strict_xml = ("", " -xml")[bool(strict_xml)]
  47. 
  48.         if indent:
  49.             indent = ' -indent'
  50.         else:
  51.             indent = ''
  52. 
  53.         if wrap is False:
  54.             wrap = ''
  55.         else:
  56.             try:
  57.                 wrap = ' -wrap %d' % int(tidyWrap)
  58.             except:
  59.                 wrap = ''
  60. 
  61.         result = os.system('"%s" %s%s%s%s -f %s -o %s %s' %
  62.                            (tidy_path, tidy_enc, strict_xml, indent, wrap,
  63.                             err_file, out_file, page_file))
  64.         use_output = bool(indent or wrap) and not result
  65.         if use_output:
  66.             output = open(out_file, 'rb').read()
  67. 
  68.         new_errs = []
  69.         for err in open(err_file, 'rb').read().splitlines():
  70.             if (err.find('Error') != -1 or
  71.                 (warnings and err.find('Warning') != -1)):
  72.                 ignore = 0
  73.                 for err_ign in errors_to_ignore or []:
  74.                     if err.find(err_ign) != -1:
  75.                         ignore = 1
  76.                         break
  77.                 if not ignore:
  78.                     new_errs.append(err)
  79. 
  80.         if new_errs:
  81.             response.body = wrong_content('<br />'.join(new_errs), orig_body)
  82.             if response.headers.has_key("Content-Length"):
  83.                 # Delete Content-Length header so finalize() recalcs it.
  84.                 del response.headers["Content-Length"]
  85.             return
  86.         elif strict_xml:
  87.             # The HTML is OK, but is it valid XML?
  88.             # Use elementtree to parse XML
  89.             from elementtree.ElementTree import parse
  90.             tag_list = ['nbsp', 'quot']
  91.             for tag in tag_list:
  92.                 orig_body = orig_body.replace('&' + tag + ';', tag.upper())
  93. 
  94.             if encoding:
  95.                 enctag = '<?xml version="1.0" encoding="%s"?>' % encoding
  96.                 orig_body = enctag + orig_body
  97. 
  98.             f = StringIO.StringIO(orig_body)
  99.             try:
 100.                 tree = parse(f)
 101.             except:
 102.                 # Wrong XML
 103.                 body_file = StringIO.StringIO()
 104.                 traceback.print_exc(file = body_file)
 105.                 body_file = '<br />'.join(body_file.getvalue())
 106.                 response.body = wrong_content(body_file, orig_body, "XML")
 107.                 if response.headers.has_key("Content-Length"):
 108.                     # Delete Content-Length header so finalize() recalcs it.
 109.                     del response.headers["Content-Length"]
 110.                 return
 111. 
 112.         if use_output:
 113.             response.body = [output]
 114.             if response.headers.has_key("Content-Length"):
 115.                 # Delete Content-Length header so finalize() recalcs it.
 116.                 del response.headers["Content-Length"]
 117. 
 118. def html_space(text):
 119.     """Escape text, replacing space with nbsp and tab with 4 nbsp's."""
 120.     return cgi.escape(text).replace('\t', '    ').replace(' ', '&nbsp;')
 121. 
 122. def html_break(text):
 123.     """Escape text, replacing newline with HTML br element."""
 124.     return cgi.escape(text).replace('\n', '<br />')
 125. 
 126. def wrong_content(header, body, content_type="HTML"):
 127.     output = ["Wrong %s:<br />%s<br />" % (content_type, html_break(header))]
 128.     for i, line in enumerate(body.splitlines()):
 129.         output.append("%03d - %s" % (i + 1, html_space(line)))
 130.     return "<br />".join(output)
 131. 
 132. 
 133. def nsgmls(temp_dir, nsgmls_path, catalog_path, errors_to_ignore=None):
 134.     response = cherrypy.response
 135. 
 136.     # the tidy tool, by its very nature it's not generator friendly,
 137.     # so we just collect the body and work with it.
 138.     orig_body = response.collapse_body()
 139. 
 140.     fct = response.headers.get('Content-Type', '')
 141.     ct = fct.split(';')[0]
 142.     encoding = ''
 143.     i = fct.find('charset=')
 144.     if i != -1:
 145.         encoding = fct[i + 8:]
 146.     if ct == 'text/html':
 147.         # Remove bits of Javascript (nsgmls doesn't seem to handle
 148.         #   them correctly (for instance, if <a appears in your
 149.         #   Javascript code nsgmls complains about it)
 150.         while True:
 151.             i = orig_body.find('<script')
 152.             if i == -1:
 153.                 break
 154.             j = orig_body.find('</script>', i)
 155.             if j == -1:
 156.                 break
 157.             orig_body = orig_body[:i] + orig_body[j+9:]
 158. 
 159.         page_file = os.path.join(temp_dir, 'page.html')
 160.         open(page_file, 'wb').write(orig_body)
 161. 
 162.         err_file = os.path.join(temp_dir, 'nsgmls.err')
 163.         command = ('%s -c%s -f%s -s -E10 %s' %
 164.                    (nsgmls_path, catalog_path, err_file, page_file))
 165.         command = command.replace('\\', '/')
 166.         os.system(command)
 167.         errs = open(err_file, 'rb').read()
 168. 
 169.         new_errs = []
 170.         for err in errs.splitlines():
 171.             ignore = False
 172.             for err_ign in errors_to_ignore or []:
 173.                 if err.find(err_ign) != -1:
 174.                     ignore = True
 175.                     break
 176.             if not ignore:
 177.                 new_errs.append(err)
 178. 
 179.         if new_errs:
 180.             response.body = wrong_content('<br />'.join(new_errs), orig_body)
 181.             if response.headers.has_key("Content-Length"):
 182.                 # Delete Content-Length header so finalize() recalcs it.
 183.                 del response.headers["Content-Length"]
 184.