Wrestling HTML
September 8, 2004
Lately I've seen HTML parsing problems everywhere. One project needed a web crawler with specialized features provided through Python code that processed arbitrary HTML. There have also been several threads on mailing lists I frequent (including XML-SIG) featuring discussions of mechanisms for dealing with broken HTML by converting it to decent XHTML. This article focuses on Python APIs for converting good or bad HTML to XML.
Based on glowing testimonials from others with HTML-parsing tasks, I looked first at BeautifulSoup; but based on the stated project goals and an examination of the API, BeautifulSoup is clearly suited to extracting bits of data from HTML rather than converting it into XML. I did, however, work up Listing 1 as a simple test case of bad HTML, based on an example in the BeautifulSoup documentation.
Listing 1: An Example of Bad HTML
<body> Go <a class="that" href="here.html"><i>here</i></a> or <i>go <b><a href="index.html">Home</a> <!--noncetag>spam</noncetag><!--eggs--> </html>
Notice the broken comment in the file. I added it because I've seen HTML parsers tripped up by strange uses of comments. <!--noncetag>spam</noncetag><!--eggs--> is a bad comment because it contains two dashes in its body.
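To see why two dashes in a comment body are fatal, here is a quick check using Python's standard library XML parser (my own illustration, not part of the article's toolchain); expat rejects any comment containing "--":

```python
import xml.etree.ElementTree as ET

good = "<root><!-- a normal comment --></root>"
bad = "<root><!--noncetag>spam</noncetag><!--eggs--></root>"

# The well-formed comment parses without complaint.
ET.fromstring(good)

# The bad comment triggers a well-formedness error, because "--"
# may not appear inside an XML comment body.
try:
    ET.fromstring(bad)
except ET.ParseError as e:
    print("rejected:", e)
```

Any tool that converts HTML to XML therefore has to repair or drop such comments before emitting output.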
uTidyLib
uTidyLib is a Python wrapper for the HTML Tidy Library Project (libtidy), an embeddable variation on Dave Raggett's HTML Tidy command-line program. Libtidy is written in C, and uTidyLib is a minimalist, straightforward wrapping. I downloaded uTidylib-0.2.zip and installed it. It requires libtidy, so I downloaded and installed the source code package tidy_src.tgz dated 11 August 2004. uTidyLib also uses ctypes, "a Python package to create and manipulate C data types in Python, and to call functions in dynamic link libraries/shared dlls," so I downloaded and installed ctypes-0.9.0.tar.gz. In all there were a lot of parts to find and set up, but the instructions were straightforward and I had no installation problems. I used the example from the uTidyLib home page to be sure it all worked in the end.
Listing 2 is the first program I worked up for taking an input file name of bad HTML and converting the contents to XHTML.
Listing 2: uTidyLib Program to Convert HTML to XHTML
import tidy
import sys

def tidy2xhtml(instream, outstream):
    options = dict(output_xhtml=1,
                   add_xml_decl=1,
                   indent=1
                   )
    tidied = tidy.parseString(instream.read(), **options)
    tidied.write(outstream)
    return

doc = open(sys.argv[1])
tidy2xhtml(doc, sys.stdout)
I had to read the entire input file in order to pass the contents as a string, because uTidyLib provides no interface for getting HTML source from a file-like object. tidy.parse is the other available function, but it takes a file name. This could be inconvenient in the case of large source files. The options dictionary represents options for the underlying Tidy, which are listed in the HTML Tidy Quick Reference. Using the dictionary constructor idiom, options have to be provided in a form acceptable as Python identifiers, in particular by converting hyphens to underscores, so the Tidy option fix-bad-comments would be specified as fix_bad_comments.
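Since every Tidy option must be mangled this way, the conversion can be done mechanically. make_tidy_options below is my own small convenience function, not part of uTidyLib:

```python
def make_tidy_options(raw_options):
    """Turn Tidy's hyphenated option names (as documented in the
    HTML Tidy Quick Reference) into valid Python identifiers."""
    return dict((name.replace('-', '_'), value)
                for name, value in raw_options.items())

options = make_tidy_options({
    'output-xhtml': 1,
    'add-xml-decl': 1,
    'fix-bad-comments': 1,
})
# The result can be passed straight along as
# tidy.parseString(source, **options)
```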
The result of running Listing 2 against Listing 1 is as follows:
<?xml version="1.0"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta name="generator" content=
"HTML Tidy for Linux/x86 (vers 1st August 2004), see www.w3.org" />
<title></title>
</head>
<body>
Go <a class="that" href="here.html"><i>here</i></a> or <i>go
<b><a href="index.html">Home</a>
<!--noncetag>spam</noncetag><!==eggs--></b></i>
</body>
</html>
Notice how the bad comment is corrected by replacing the "--" with "==". uTidyLib also fills out all the half-specified elements, whether valid SGML tag minimization (it's perfectly legal HTML to not close p tags, for instance) or not (there is a closing but not an opening html tag).
I tried uTidyLib on a variety of files, usually with very nice XHTML results. I also tried with a variety of encodings, since the web crawler project I mentioned involved crawling international versions of sites. I ran into trouble as soon as I tried pages in Japanese. As an example, I use the Japanese document Hello world HTML, which is actually perfectly valid HTML that just happens to be encoded in the popular Shift-JIS encoding (there is a mix of English and Japanese in the document). Figure 1 is a bit of English/Japanese mix from the Table of Contents.
Figure 1: Sample of English and Japanese Text from Valid HTML Document
This bullet item gets turned into the following XML by uTidyLib:
<li> <a href="hwht01.htm" accesskey="1">Section 1</a> : HTML Šî‘b‚ÌŠî‘b </li>
This would be rendered in the browser as in Figure 2:
Figure 2: Sample of English and Japanese Text from Valid HTML Doc after Mangling by uTidyLib
Clearly not what the original document author intended. It turns out that Tidy cannot really detect the source document's encoding, even when it's properly and clearly stated (the document has LANG="ja" in the html element and <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=Shift_JIS">). Tidy just assumes ISO-8859-1. It also turns out that Tidy outputs US-ASCII encoding by default. I suppose the US-ASCII default for generated XHTML is to accommodate outdated browsers that can't deal with UTF-8 and UTF-16. The inability to detect encodings, on the other hand, is unfortunate and a severe limitation. I trawled the options and couldn't find anything to turn encoding detection on, but I did find options to tell Tidy what input and output encodings to use.
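One workaround is to sniff the declared charset yourself before handing the source to Tidy. The regular expression below is my own rough sketch; it only handles the common META http-equiv form and ignores every other way an encoding can be declared:

```python
import re

META_CHARSET = re.compile(r'charset\s*=\s*["\']?([A-Za-z0-9_-]+)',
                          re.IGNORECASE)

def sniff_charset(html_source, default='latin1'):
    """Crudely look for a charset declaration near the top of the
    document, falling back to a default when none is found."""
    match = META_CHARSET.search(html_source[:2048])
    if match:
        return match.group(1)
    return default

sample = ('<html LANG="ja"><head><META HTTP-EQUIV="Content-Type" '
          'CONTENT="text/html; charset=Shift_JIS"></head></html>')
print(sniff_charset(sample))   # Shift_JIS
```

The sniffed name can then be fed to Tidy's input-encoding option.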
I updated the program in Listing 2 to specify an encoding for the source document by hand (e.g. "Shift-JIS") and to always produce UTF-8 output. In doing so I ran into another odd limitation in Tidy. It seems to refuse encoding names unless they are in all lowercase, with all dashes eliminated. For example, it refused "Shift-JIS" or "UTF-8", throwing an exception: "tidy.error.OptionArgError: missing or malformed argument for option: input-encoding". By trial and error I figured out that "shiftjis" and "utf8" were required for things to work and that I could not use any likely spelling of "ISO-8859-1" at all, but had to use the alternate name "latin1" instead. The updated code is in Listing 3.
Listing 3: uTidyLib Program to Convert HTML to XHTML Using Specified Encodings
import tidy
import sys

def tidy2xhtml(instream, outstream, encoding):
    options = dict(output_xhtml=1,
                   add_xml_decl=1,
                   indent=1,
                   output_encoding='utf8',
                   input_encoding=encoding
                   )
    tidied = tidy.parseString(instream.read(), **options)
    tidied.write(outstream)
    return

doc = open(sys.argv[1])
try:
    encoding = sys.argv[2]
except IndexError:
    encoding = 'latin1'
tidy2xhtml(doc, sys.stdout, encoding)
This allows me to specify the encoding as the second command-line argument if I know it.
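Rather than memorize Tidy's spellings, the name mangling can be captured in another small helper. This is my own guess at the rules, based purely on the trial and error described above (lowercase, drop dashes, plus the one latin1 alias I needed):

```python
def tidy_encoding_name(name):
    """Normalize a conventional encoding name into the form Tidy
    accepted in my tests: lowercased, dashes removed, with
    ISO-8859-1 mapped to its alias latin1."""
    normalized = name.lower().replace('-', '')
    if normalized == 'iso88591':
        return 'latin1'
    return normalized

print(tidy_encoding_name('Shift-JIS'))   # shiftjis
print(tidy_encoding_name('UTF-8'))       # utf8
print(tidy_encoding_name('ISO-8859-1'))  # latin1
```

Other aliases may well be needed for encodings I didn't try; Tidy's own documentation lists the accepted values.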
libxml2's HTML Parser
I'm always surprised to see what useful bits are buried in libxml2 and available through the Python binding (see my article on this topic). One of them is an HTML reader that can handle bad HTML and create a tree object that is not at all XHTML, but is at least a well-formed rendition of the source document, which is usually good enough. The following snippet illustrates this tool.
>>> import libxml2
>>> #Again seems to require the full string
>>> source = open('listing1.html').read()
>>> hdoc = libxml2.htmlParseDoc(source, None)
HTML parser error : Opening and ending tag mismatch: html and b
</html>
       ^
HTML parser error : Opening and ending tag mismatch: html and i
</html>
       ^
Despite these warnings, hdoc is a usable node at this point. It doesn't give you DOM, but rather libxml2's specialized tree API, which, as I mentioned in an earlier article, I find unevenly documented and hard to navigate. The libxml2 page talks about "DOM," but I think they use the term generically, not meaning the W3C specification and certainly not the Python standard-library DOM conventions.
>>> print hdoc
/usr/lib/python2.3/site-packages/libxml2.py:3597: \
    FutureWarning: %u/%o/%x/%X of negative int will \
    return a signed string in Python 2.4 and up
  return "<xmlDoc (%s) object at 0x%x>" % (self.name, id(self))
<xmlDoc (None) object at 0xf7032bcc>
The warning, which I got with my Python 2.3 installation, appears only the first time you convert a node to string (e.g. implicitly, using print) and seems harmless. I assume the libxml2 crew will address any potential problems before Python 2.4 is finalized. You can see the document libxml2 interpreted from the bad HTML by re-serializing it.
>>> print hdoc.serialize()
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
    "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body><p> Go <a class="that" href="here.html"><i>here</i></a>
or <i>go <b><a href="index.html">Home</a>
<!--noncetag>spam</noncetag><!--eggs-->
</b></i></p></body></html>

>>> hdoc.freeDoc()
Clearly it didn't complete the document as effectively as uTidyLib did: it didn't fix the broken comment, and the generated document-type declaration is untenable for an XML document, but the result is useful nevertheless. Don't forget the freeDoc() call, since libxml2/Python requires manual memory management.
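Because an exception between the parse and the freeDoc() call would leak the document, I find it worth wrapping the pattern in a try/finally helper. The sketch below is generic and of my own devising; with libxml2, the parse and free callables would be htmlParseDoc and the node's freeDoc method:

```python
def with_parsed_doc(parse, free, source, operation):
    """Parse source, apply operation to the resulting document,
    and guarantee the matching free call runs even if operation
    raises an exception."""
    doc = parse(source)
    try:
        return operation(doc)
    finally:
        free(doc)

# Hypothetical libxml2 usage (names as in the snippets above):
#   xhtml = with_parsed_doc(
#       lambda s: libxml2.htmlParseDoc(s, None),
#       lambda d: d.freeDoc(),
#       source,
#       lambda d: d.serialize())
```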
I also pointed libxml2's HTML parser directly at the Japanese document mentioned earlier:

>>> uri = 'http://www.tg.rim.or.jp/~hexane/ach/hwht/'
>>> hdoc = libxml2.htmlParseFile(uri, None)
The result from re-serialization seemed to maintain the Shift-JIS content, but I got a very strange JavaScript error message when I wrote it to a file and tried to view it in Firefox. Clearly dealing with HTML files in various encodings is a difficult task that complicates any efforts to cleanly process the HTML.
Wrap Up
I've heard some other Python tools discussed for converting HTML to usable XML (or XML tree objects):
- ElementTidy uses Tidy to create XHTML in the form of an ElementTree object (see my article on the topic).
- The twisted.web.microdom module in Twisted has an option, beExtremelyLenient=True, that creates a tree from even broken HTML.
If you just need to extract information from broken HTML, there are some other options.
- The aforementioned BeautifulSoup.
- The HTML Scraper recipe on the Python Cookbook needs a lot of tweaking, based on my experience.
Thanks to the participants in the mailing-list threads discussing Python parsers for broken HTML. If you have other suggestions I haven't covered, please post them as comments to this article.
News and Notes
XML-SIG members and others have, as usual, been busy this last month. Mike Hostetler announced XMLBuilder 1.1. "You create an XMLBuilder object, send it some dictionary data, and it will generate the XML for you." See the announcement.
Mark Pilgrim announced the publication of his book Dive Into Python, available in its entirety online (though you should buy the physical book if you like it). Chapter 9: XML Processing is especially of interest. See the announcement.
I released Scimitar 0.6.0, an update of my ISO Schematron implementation that compiles a Schematron schema into a Python validator script. It adds support for keys, fixes diagnostic messages, and a few other things. See the announcement.
Fredrik Lundh did some time and space benchmarks of Python libraries for parsing and representing XML. It includes minidom, elementtree, PyRXPu (the only XML-compliant variant of PyRXP), and pxdom. He does not specify his methodology, except that he parsed a "3.5 MB source file." More clarity on his test methods, including harness code and measurement methodology, would be nice. He plans to add xml.objectify, libxml/Python, and cDomlette.
Jarno Virtanen has posted some quick code for performing an XSL transformation in Jython. I've added this to my reference page on Python XSLT processing APIs.