/ scripts / htmldiff / htmldiff
htmldiff
 1  #!/usr/bin/python
 2  #
 3  # Modified from the htmldiff script developed by Dominique Hazaël-Massieux
 4  # for the http://services.w3.org/htmldiff website. That script did not
 5  # include a copyright statement.
 6  
 7  import atexit
 8  import os
 9  import re
10  import sys
11  import tempfile
12  import tidy
13  
14  from subprocess import Popen, PIPE
15  
def tidyFile(filename):
    """Run HTML Tidy over *filename* and return the result as a temp file.

    The input is parsed with a basic option set; when the first 4096
    characters contain an HTML5 doctype (``<!doctype html>``), the HTML5
    option set is merged in before parsing.  If a non-HTML5 parse reports
    errors, the document is parsed a second time with the HTML5 options
    merged in.  Any errors remaining on the final parse are written to
    stderr, but the tidied output is still returned.

    Args:
        filename: path of the HTML document to tidy.

    Returns:
        An open ``tempfile.NamedTemporaryFile`` holding the tidied markup,
        flushed and rewound to offset 0.  Its ``close`` is registered with
        ``atexit`` so the file stays available (via ``.name``) until the
        interpreter exits.
    """
    # Read the document once; the context manager guarantees the handle is
    # released even if tidy raises while parsing.
    with open(filename, 'r') as ifp:
        content = ifp.read()

    # option for tidy
    options = dict(tidy_mark=0, show_warnings=0, quiet=1, char_encoding='utf8')
    # Only the head of the document is inspected for the doctype.
    html5 = re.search(r"<!doctype\s+html\s*>", content[:4096],
                      re.IGNORECASE)
    html5_options = {'add_xml_space': 'no',
                     'output_xhtml': 'no',
                     'tidy_mark': 'no',
                     'new_blocklevel_tags': 'article,aside,canvas,dialog,details,figcaption,figure,footer,header,hgroup,menu,nav,section,main,summary,math,semantics,mrow,mfenced,mtable,mtr,mtd,mi,mn,msub,mo,mfrac,munderover,mtext,svg,g,image,rect,text,desc,line,path,polygon,ellipse,tspan,defs,feoffset,fecolormatrix,filter,fegaussianblur,feblend,marker,circle',
                     'new_inline_tags': 'video,audio,canvas,ruby,rt,rp,time,meter,progress,track,source,emu-val,emu-nt,emu-t,mark',
                     'break_before_br': 'no',
                     'vertical_space': 'no',
                     'enclose_text': 'no',
                     'numeric_entities': 'yes',
                     'wrap': '1000',
                     'wrap_attributes': 'no',
                     'drop_empty_paras': 'no'
                     }
    if html5:
        options.update(html5_options)
    newtidy = tidy.parseString(content, **options)

    # A document without the HTML5 doctype may still use HTML5 elements;
    # if the plain parse reported errors, retry with the HTML5 options.
    if newtidy.errors and not html5:
        options.update(html5_options)
        newtidy = tidy.parseString(content, **options)

    fp = tempfile.NamedTemporaryFile(
        mode='w+', prefix='htmldiff-', suffix='.html')
    # Keep the temp file alive for the caller; it is closed at exit.
    atexit.register(fp.close)
    fp.write(str(newtidy))
    fp.flush()
    fp.seek(0)

    if newtidy.errors:
        sys.stderr.write('tidyFile: tidy.parseString error: %s\n' % str(newtidy.errors))
    return fp
59  
if __name__ == '__main__':
    # Entry point: tidy both input documents, then hand the tidied temp
    # files to the htmldiff.pl script that lives next to this script.
    if len(sys.argv) < 3:
        sys.stderr.write('tidy: need args file1 file2\n')
        sys.exit(1)

    refdoc = tidyFile(sys.argv[1])
    newdoc = tidyFile(sys.argv[2])

    scriptdir = os.path.abspath(os.path.dirname(sys.argv[0]))
    perlscript = os.path.join(scriptdir, 'htmldiff.pl')

    p = Popen([perlscript, refdoc.name, newdoc.name],
              stdin=PIPE, stdout=PIPE, stderr=PIPE)
    sys.stdout.flush()
    sys.stderr.flush()
    # communicate() closes the child's stdin itself, so no explicit
    # p.stdin.close() is needed afterwards.
    (out, err) = p.communicate()

    # On Python 3 the pipes yield bytes; decode so the output is printed as
    # text rather than as a b'...' repr.  On Python 2 the values are already
    # str and are left untouched.
    # NOTE(review): assumes htmldiff.pl emits UTF-8 - confirm.
    if not isinstance(out, str):
        out = out.decode('utf-8', 'replace')
    if not isinstance(err, str):
        err = err.decode('utf-8', 'replace')

    # Any output on the child's stderr is treated as failure.
    if err:
        # write() accepts exactly one argument, so format the message first.
        sys.stderr.write('htmldiff: An error occured when running htmldiff.pl on the documents: %s\n' % err)
        sys.exit(1)
    else:
        print(out)