- # and only if all the remaining content is nested underneath it.
- # This means that the divs would be retained in the following:
- # <div>foo</div><div>bar</div>
- while pieces and len(pieces)>1 and not pieces[-1].strip():
- del pieces[-1]
- while pieces and len(pieces)>1 and not pieces[0].strip():
- del pieces[0]
- if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
- depth = 0
- for piece in pieces[:-1]:
- if piece.startswith('</'):
- depth -= 1
- if depth == 0: break
- elif piece.startswith('<') and not piece.endswith('/>'):
- depth += 1
- else:
- pieces = pieces[1:-1]
-
- output = ''.join(pieces)
- if stripWhitespace:
- output = output.strip()
- if not expectingText: return output
-
- # decode base64 content
- if base64 and self.contentparams.get('base64', 0):
- try:
- output = base64.decodestring(output)
- except binascii.Error:
- pass
- except binascii.Incomplete:
- pass
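For reference, a minimal standalone sketch of the decode step above, with a hypothetical base64 payload (base64.decodestring is the Python 2 spelling this module relies on):

import base64, binascii

encoded = 'PGI+aGVsbG88L2I+\n'  # hypothetical value of a mode="base64" content element
try:
    decoded = base64.decodestring(encoded)  # '<b>hello</b>'
except (binascii.Error, binascii.Incomplete):
    decoded = encoded  # malformed payloads are left untouched, as pop() does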
-
- # resolve relative URIs
- if (element in self.can_be_relative_uri) and output:
- output = self.resolveURI(output)
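A minimal sketch of what resolveURI does with a single value, assuming it follows standard urljoin semantics against the current xml:base (URLs here are hypothetical):

import urlparse  # urllib.parse in Python 3

base = 'http://example.org/feed/'             # current self.baseuri
relative = 'images/cover.png'                 # e.g. a logo or icon value
absolute = urlparse.urljoin(base, relative)   # 'http://example.org/feed/images/cover.png'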
-
- # decode entities within embedded markup
- if not self.contentparams.get('base64', 0):
- output = self.decodeEntities(element, output)
-
- if self.lookslikehtml(output):
- self.contentparams['type']='text/html'
-
- # remove temporary cruft from contentparams
- try:
- del self.contentparams['mode']
- except KeyError:
- pass
- try:
- del self.contentparams['base64']
- except KeyError:
- pass
-
- is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
- # resolve relative URIs within embedded markup
- if is_htmlish and RESOLVE_RELATIVE_URIS:
- if element in self.can_contain_relative_uris:
- output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
-
- # parse microformats
- # (must do this before sanitizing because some microformats
- # rely on elements that we sanitize)
- if is_htmlish and element in ['content', 'description', 'summary']:
- mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
- if mfresults:
- for tag in mfresults.get('tags', []):
- self._addTag(tag['term'], tag['scheme'], tag['label'])
- for enclosure in mfresults.get('enclosures', []):
- self._start_enclosure(enclosure)
- for xfn in mfresults.get('xfn', []):
- self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
- vcard = mfresults.get('vcard')
- if vcard:
- self._getContext()['vcard'] = vcard
-
- # sanitize embedded markup
- if is_htmlish and SANITIZE_HTML:
- if element in self.can_contain_dangerous_markup:
- output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
-
- if self.encoding and type(output) != type(u''):
- try:
- output = unicode(output, self.encoding)
- except:
- pass
-
- # address common error where people take data that is already
- # utf-8, presume that it is iso-8859-1, and re-encode it.
- if self.encoding=='utf-8' and type(output) == type(u''):
- try:
- output = unicode(output.encode('iso-8859-1'), 'utf-8')
- except:
- pass
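The block above reverses a frequent publisher mistake; a compact standalone sketch of the same round trip (hypothetical helper, Python 2 string types):

def undo_double_utf8(text):
    # 'text' is unicode that was produced by decoding utf-8 bytes as iso-8859-1.
    # Encoding back to latin-1 recovers the original bytes; decoding those as
    # utf-8 yields the intended characters. If either step fails, the input was
    # not double-encoded, so it is returned unchanged.
    try:
        return unicode(text.encode('iso-8859-1'), 'utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return text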
-
- # map win-1252 extensions to the proper code points
- if type(output) == type(u''):
- output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output])
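A small sketch of the code-point mapping above, using a hypothetical two-entry stand-in for the module's _cp1252 table:

cp1252_sample = {
    unichr(0x93): unichr(0x201c),  # left double quotation mark
    unichr(0x94): unichr(0x201d),  # right double quotation mark
}

def map_cp1252(text):
    # swap stray windows-1252 code points for their Unicode equivalents
    return u''.join([cp1252_sample.get(c, c) for c in text])

# map_cp1252(u'\x93smart quotes\x94') == u'\u201csmart quotes\u201d'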
-
- # categories/tags/keywords/whatever are handled in _end_category
- if element == 'category':
- return output
-
- if element == 'title' and self.hasTitle:
- return output
-
- # store output in appropriate place(s)
- if self.inentry and not self.insource:
- if element == 'content':
- self.entries[-1].setdefault(element, [])
- contentparams = copy.deepcopy(self.contentparams)
- contentparams['value'] = output
- self.entries[-1][element].append(contentparams)
- elif element == 'link':
- self.entries[-1][element] = output
- if output:
- self.entries[-1]['links'][-1]['href'] = output
- else:
- if element == 'description':
- element = 'summary'
- self.entries[-1][element] = output
- if self.incontent:
- contentparams = copy.deepcopy(self.contentparams)
- contentparams['value'] = output
- self.entries[-1][element + '_detail'] = contentparams
- elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
- context = self._getContext()
- if element == 'description':
- element = 'subtitle'
- context[element] = output
- if element == 'link':
- context['links'][-1]['href'] = output
- elif self.incontent:
- contentparams = copy.deepcopy(self.contentparams)
- contentparams['value'] = output
- context[element + '_detail'] = contentparams
- return output
-
- def pushContent(self, tag, attrsD, defaultContentType, expectingText):
- self.incontent += 1
- if self.lang: self.lang=self.lang.replace('_','-')
- self.contentparams = FeedParserDict({
- 'type': self.mapContentType(attrsD.get('type', defaultContentType)),
- 'language': self.lang,
- 'base': self.baseuri})
- self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
- self.push(tag, expectingText)
-
- def popContent(self, tag):
- value = self.pop(tag)
- self.incontent -= 1
- self.contentparams.clear()
- return value
-
- # a number of elements in a number of RSS variants are nominally plain
- # text, but this is routinely ignored. This is an attempt to detect
- # the most common cases. As false positives often result in silent
- # data loss, this function errs on the conservative side.
- def lookslikehtml(self, str):
- if self.version.startswith('atom'): return
- if self.contentparams.get('type','text/html') != 'text/plain': return
-
- # must have a close tag or an entity reference to qualify
- if not (re.search(r'</(\w+)>',str) or re.search("&#?\w+;",str)): return
-
- # all tags must be in a restricted subset of valid HTML tags
- if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
- re.findall(r'</?(\w+)',str)): return
-
- # all entities must have been defined as valid HTML entities
- from htmlentitydefs import entitydefs
- if filter(lambda e: e not in entitydefs.keys(),
- re.findall(r'&(\w+);',str)): return
-
- return 1
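For orientation, a few hypothetical inputs and how the heuristic above treats them (assuming the type and version guards have already been passed; these strings are not from the test suite):

samples = [
    'some <b>bold</b> text',         # accepted: has a close tag and 'b' is an acceptable element
    'plain text, no markup at all',  # rejected: no close tag and no entity reference
    '<script>alert(1)</script>',     # rejected: 'script' is not in acceptable_elements
    'text with &fakeentity; in it',  # rejected: entity not defined in htmlentitydefs
]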
-
- def _mapToStandardPrefix(self, name):
- colonpos = name.find(':')
- if colonpos <> -1:
- prefix = name[:colonpos]
- suffix = name[colonpos+1:]
- prefix = self.namespacemap.get(prefix, prefix)
- name = prefix + ':' + suffix
- return name
-
- def _getAttribute(self, attrsD, name):
- return attrsD.get(self._mapToStandardPrefix(name))
-
- def _isBase64(self, attrsD, contentparams):
- if attrsD.get('mode', '') == 'base64':
- return 1
- if self.contentparams['type'].startswith('text/'):
- return 0
- if self.contentparams['type'].endswith('+xml'):
- return 0
- if self.contentparams['type'].endswith('/xml'):
- return 0
- return 1
-
- def _itsAnHrefDamnIt(self, attrsD):
- href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
- if href:
- try:
- del attrsD['url']
- except KeyError:
- pass
- try:
- del attrsD['uri']
- except KeyError:
- pass
- attrsD['href'] = href
- return attrsD
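A quick sketch of the normalisation performed above, with hypothetical enclosure attributes:

attrsD = {'url': 'http://example.org/episode.mp3', 'type': 'audio/mpeg'}
# after _itsAnHrefDamnIt(attrsD) the address is exposed under 'href' only:
# {'href': 'http://example.org/episode.mp3', 'type': 'audio/mpeg'}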
-
- def _save(self, key, value):
- context = self._getContext()
- context.setdefault(key, value)
-
- def _start_rss(self, attrsD):
- versionmap = {'0.91': 'rss091u',
- '0.92': 'rss092',
- '0.93': 'rss093',
- '0.94': 'rss094'}
- #If we're here then this is an RSS feed.
- #If we don't have a version or have a version that starts with something
- #other than RSS then there's been a mistake. Correct it.
- if not self.version or not self.version.startswith('rss'):
- attr_version = attrsD.get('version', '')
- version = versionmap.get(attr_version)
- if version:
- self.version = version
- elif attr_version.startswith('2.'):
- self.version = 'rss20'
- else:
- self.version = 'rss'
-
- def _start_dlhottitles(self, attrsD):
- self.version = 'hotrss'
-
- def _start_channel(self, attrsD):
- self.infeed = 1
- self._cdf_common(attrsD)
- _start_feedinfo = _start_channel
-
- def _cdf_common(self, attrsD):
- if attrsD.has_key('lastmod'):
- self._start_modified({})
- self.elementstack[-1][-1] = attrsD['lastmod']
- self._end_modified()
- if attrsD.has_key('href'):
- self._start_link({})
- self.elementstack[-1][-1] = attrsD['href']
- self._end_link()
-
- def _start_feed(self, attrsD):
- self.infeed = 1
- versionmap = {'0.1': 'atom01',
- '0.2': 'atom02',
- '0.3': 'atom03'}
- if not self.version:
- attr_version = attrsD.get('version')
- version = versionmap.get(attr_version)
- if version:
- self.version = version
- else:
- self.version = 'atom'
-
- def _end_channel(self):
- self.infeed = 0
- _end_feed = _end_channel
-
- def _start_image(self, attrsD):
- context = self._getContext()
- context.setdefault('image', FeedParserDict())
- self.inimage = 1
- self.hasTitle = 0
- self.push('image', 0)
-
- def _end_image(self):
- self.pop('image')
- self.inimage = 0
-
- def _start_textinput(self, attrsD):
- context = self._getContext()
- context.setdefault('textinput', FeedParserDict())
- self.intextinput = 1
- self.hasTitle = 0
- self.push('textinput', 0)
- _start_textInput = _start_textinput
-
- def _end_textinput(self):
- self.pop('textinput')
- self.intextinput = 0
- _end_textInput = _end_textinput
-
- def _start_author(self, attrsD):
- self.inauthor = 1
- self.push('author', 1)
- _start_managingeditor = _start_author
- _start_dc_author = _start_author
- _start_dc_creator = _start_author
- _start_itunes_author = _start_author
-
- def _end_author(self):
- self.pop('author')
- self.inauthor = 0
- self._sync_author_detail()
- _end_managingeditor = _end_author
- _end_dc_author = _end_author
- _end_dc_creator = _end_author
- _end_itunes_author = _end_author
-
- def _start_itunes_owner(self, attrsD):
- self.inpublisher = 1
- self.push('publisher', 0)
-
- def _end_itunes_owner(self):
- self.pop('publisher')
- self.inpublisher = 0
- self._sync_author_detail('publisher')
-
- def _start_contributor(self, attrsD):
- self.incontributor = 1
- context = self._getContext()
- context.setdefault('contributors', [])
- context['contributors'].append(FeedParserDict())
- self.push('contributor', 0)
-
- def _end_contributor(self):
- self.pop('contributor')
- self.incontributor = 0
-
- def _start_dc_contributor(self, attrsD):
- self.incontributor = 1
- context = self._getContext()
- context.setdefault('contributors', [])
- context['contributors'].append(FeedParserDict())
- self.push('name', 0)
-
- def _end_dc_contributor(self):
- self._end_name()
- self.incontributor = 0
-
- def _start_name(self, attrsD):
- self.push('name', 0)
- _start_itunes_name = _start_name
-
- def _end_name(self):
- value = self.pop('name')
- if self.inpublisher:
- self._save_author('name', value, 'publisher')
- elif self.inauthor:
- self._save_author('name', value)
- elif self.incontributor:
- self._save_contributor('name', value)
- elif self.intextinput:
- context = self._getContext()
- context['name'] = value
- _end_itunes_name = _end_name
-
- def _start_width(self, attrsD):
- self.push('width', 0)
-
- def _end_width(self):
- value = self.pop('width')
- try:
- value = int(value)
- except:
- value = 0
- if self.inimage:
- context = self._getContext()
- context['width'] = value
-
- def _start_height(self, attrsD):
- self.push('height', 0)
-
- def _end_height(self):
- value = self.pop('height')
- try:
- value = int(value)
- except:
- value = 0
- if self.inimage:
- context = self._getContext()
- context['height'] = value
-
- def _start_url(self, attrsD):
- self.push('href', 1)
- _start_homepage = _start_url
- _start_uri = _start_url
-
- def _end_url(self):
- value = self.pop('href')
- if self.inauthor:
- self._save_author('href', value)
- elif self.incontributor:
- self._save_contributor('href', value)
- _end_homepage = _end_url
- _end_uri = _end_url
-
- def _start_email(self, attrsD):
- self.push('email', 0)
- _start_itunes_email = _start_email
-
- def _end_email(self):
- value = self.pop('email')
- if self.inpublisher:
- self._save_author('email', value, 'publisher')
- elif self.inauthor:
- self._save_author('email', value)
- elif self.incontributor:
- self._save_contributor('email', value)
- _end_itunes_email = _end_email
-
- def _getContext(self):
- if self.insource:
- context = self.sourcedata
- elif self.inimage and self.feeddata.has_key('image'):
- context = self.feeddata['image']
- elif self.intextinput:
- context = self.feeddata['textinput']
- elif self.inentry:
- context = self.entries[-1]
- else:
- context = self.feeddata
- return context
-
- def _save_author(self, key, value, prefix='author'):
- context = self._getContext()
- context.setdefault(prefix + '_detail', FeedParserDict())
- context[prefix + '_detail'][key] = value
- self._sync_author_detail()
-
- def _save_contributor(self, key, value):
- context = self._getContext()
- context.setdefault('contributors', [FeedParserDict()])
- context['contributors'][-1][key] = value
-
- def _sync_author_detail(self, key='author'):
- context = self._getContext()
- detail = context.get('%s_detail' % key)
- if detail:
- name = detail.get('name')
- email = detail.get('email')
- if name and email:
- context[key] = '%s (%s)' % (name, email)
- elif name:
- context[key] = name
- elif email:
- context[key] = email
- else:
- author, email = context.get(key), None
- if not author: return
- emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
- if emailmatch:
- email = emailmatch.group(0)
- # probably a better way to do the following, but it passes all the tests
- author = author.replace(email, '')
- author = author.replace('()', '')
- author = author.replace('<>', '')
- author = author.replace('&lt;&gt;', '')
- author = author.strip()
- if author and (author[0] == '('):
- author = author[1:]
- if author and (author[-1] == ')'):
- author = author[:-1]
- author = author.strip()
- if author or email:
- context.setdefault('%s_detail' % key, FeedParserDict())
- if author:
- context['%s_detail' % key]['name'] = author
- if email:
- context['%s_detail' % key]['email'] = email
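A simplified sketch of the extraction above for the common RSS 'name (email)' form, using a much shorter pattern than the module's full regular expression:

import re

author = 'Jane Doe (jane@example.org)'  # hypothetical author value
match = re.search(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+', author)
email = match.group(0)                                        # 'jane@example.org'
name = author.replace(email, '').replace('()', '').strip()    # 'Jane Doe'
# author_detail then carries both pieces: name and email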
-
- def _start_subtitle(self, attrsD):
- self.pushContent('subtitle', attrsD, 'text/plain', 1)
- _start_tagline = _start_subtitle
- _start_itunes_subtitle = _start_subtitle
-
- def _end_subtitle(self):
- self.popContent('subtitle')
- _end_tagline = _end_subtitle
- _end_itunes_subtitle = _end_subtitle
-
- def _start_rights(self, attrsD):
- self.pushContent('rights', attrsD, 'text/plain', 1)
- _start_dc_rights = _start_rights
- _start_copyright = _start_rights
-
- def _end_rights(self):
- self.popContent('rights')
- _end_dc_rights = _end_rights
- _end_copyright = _end_rights
-
- def _start_item(self, attrsD):
- self.entries.append(FeedParserDict())
- self.push('item', 0)
- self.inentry = 1
- self.guidislink = 0
- self.hasTitle = 0
- id = self._getAttribute(attrsD, 'rdf:about')
- if id:
- context = self._getContext()
- context['id'] = id
- self._cdf_common(attrsD)
- _start_entry = _start_item
- _start_product = _start_item
-
- def _end_item(self):
- self.pop('item')
- self.inentry = 0
- _end_entry = _end_item
-
- def _start_dc_language(self, attrsD):
- self.push('language', 1)
- _start_language = _start_dc_language
-
- def _end_dc_language(self):
- self.lang = self.pop('language')
- _end_language = _end_dc_language
-
- def _start_dc_publisher(self, attrsD):
- self.push('publisher', 1)
- _start_webmaster = _start_dc_publisher
-
- def _end_dc_publisher(self):
- self.pop('publisher')
- self._sync_author_detail('publisher')
- _end_webmaster = _end_dc_publisher
-
- def _start_published(self, attrsD):
- self.push('published', 1)
- _start_dcterms_issued = _start_published
- _start_issued = _start_published
-
- def _end_published(self):
- value = self.pop('published')
- self._save('published_parsed', _parse_date(value))
- _end_dcterms_issued = _end_published
- _end_issued = _end_published
-
- def _start_updated(self, attrsD):
- self.push('updated', 1)
- _start_modified = _start_updated
- _start_dcterms_modified = _start_updated
- _start_pubdate = _start_updated
- _start_dc_date = _start_updated
-
- def _end_updated(self):
- value = self.pop('updated')
- parsed_value = _parse_date(value)
- self._save('updated_parsed', parsed_value)
- _end_modified = _end_updated
- _end_dcterms_modified = _end_updated
- _end_pubdate = _end_updated
- _end_dc_date = _end_updated
-
- def _start_created(self, attrsD):
- self.push('created', 1)
- _start_dcterms_created = _start_created
-
- def _end_created(self):
- value = self.pop('created')
- self._save('created_parsed', _parse_date(value))
- _end_dcterms_created = _end_created
-
- def _start_expirationdate(self, attrsD):
- self.push('expired', 1)
-
- def _end_expirationdate(self):
- self._save('expired_parsed', _parse_date(self.pop('expired')))
-
- def _start_cc_license(self, attrsD):
- context = self._getContext()
- value = self._getAttribute(attrsD, 'rdf:resource')
- attrsD = FeedParserDict()
- attrsD['rel']='license'
- if value: attrsD['href']=value
- context.setdefault('links', []).append(attrsD)
-
- def _start_creativecommons_license(self, attrsD):
- self.push('license', 1)
- _start_creativeCommons_license = _start_creativecommons_license
-
- def _end_creativecommons_license(self):
- value = self.pop('license')
- context = self._getContext()
- attrsD = FeedParserDict()
- attrsD['rel']='license'
- if value: attrsD['href']=value
- context.setdefault('links', []).append(attrsD)
- del context['license']
- _end_creativeCommons_license = _end_creativecommons_license
-
- def _addXFN(self, relationships, href, name):
- context = self._getContext()
- xfn = context.setdefault('xfn', [])
- value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name})
- if value not in xfn:
- xfn.append(value)
-
- def _addTag(self, term, scheme, label):
- context = self._getContext()
- tags = context.setdefault('tags', [])
- if (not term) and (not scheme) and (not label): return
- value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
- if value not in tags:
- tags.append(value)
-
- def _start_category(self, attrsD):
- if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
- term = attrsD.get('term')
- scheme = attrsD.get('scheme', attrsD.get('domain'))
- label = attrsD.get('label')
- self._addTag(term, scheme, label)
- self.push('category', 1)
- _start_dc_subject = _start_category
- _start_keywords = _start_category
-
- def _end_itunes_keywords(self):
- for term in self.pop('itunes_keywords').split():
- self._addTag(term, 'http://www.itunes.com/', None)
-
- def _start_itunes_category(self, attrsD):
- self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
- self.push('category', 1)
-
- def _end_category(self):
- value = self.pop('category')
- if not value: return
- context = self._getContext()
- tags = context['tags']
- if value and len(tags) and not tags[-1]['term']:
- tags[-1]['term'] = value
- else:
- self._addTag(value, None, None)
- _end_dc_subject = _end_category
- _end_keywords = _end_category
- _end_itunes_category = _end_category
-
- def _start_cloud(self, attrsD):
- self._getContext()['cloud'] = FeedParserDict(attrsD)
-
- def _start_link(self, attrsD):
- attrsD.setdefault('rel', 'alternate')
- if attrsD['rel'] == 'self':
- attrsD.setdefault('type', 'application/atom+xml')
- else:
- attrsD.setdefault('type', 'text/html')
- context = self._getContext()
- attrsD = self._itsAnHrefDamnIt(attrsD)
- if attrsD.has_key('href'):
- attrsD['href'] = self.resolveURI(attrsD['href'])
- if attrsD.get('rel')=='enclosure' and not context.get('id'):
- context['id'] = attrsD.get('href')
- expectingText = self.infeed or self.inentry or self.insource
- context.setdefault('links', [])
- context['links'].append(FeedParserDict(attrsD))
- if attrsD.has_key('href'):
- expectingText = 0
- if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
- context['link'] = attrsD['href']
- else:
- self.push('link', expectingText)
- _start_producturl = _start_link
-
- def _end_link(self):
- value = self.pop('link')
- context = self._getContext()
- _end_producturl = _end_link
-
- def _start_guid(self, attrsD):
- self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
- self.push('id', 1)
-
- def _end_guid(self):
- value = self.pop('id')
- self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
- if self.guidislink:
- # guid acts as link, but only if 'ispermalink' is not present or is 'true',
- # and only if the item doesn't already have a link element
- self._save('link', value)
-
- def _start_title(self, attrsD):
- if self.svgOK: return self.unknown_starttag('title', attrsD.items())
- self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
- _start_dc_title = _start_title
- _start_media_title = _start_title
-
- def _end_title(self):
- if self.svgOK: return
- value = self.popContent('title')
- if not value: return
- context = self._getContext()
- self.hasTitle = 1
- _end_dc_title = _end_title
-
- def _end_media_title(self):
- hasTitle = self.hasTitle
- self._end_title()
- self.hasTitle = hasTitle
-
- def _start_description(self, attrsD):
- context = self._getContext()
- if context.has_key('summary'):
- self._summaryKey = 'content'
- self._start_content(attrsD)
- else:
- self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
- _start_dc_description = _start_description
-
- def _start_abstract(self, attrsD):
- self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
-
- def _end_description(self):
- if self._summaryKey == 'content':
- self._end_content()
- else:
- value = self.popContent('description')
- self._summaryKey = None
- _end_abstract = _end_description
- _end_dc_description = _end_description
-
- def _start_info(self, attrsD):
- self.pushContent('info', attrsD, 'text/plain', 1)
- _start_feedburner_browserfriendly = _start_info
-
- def _end_info(self):
- self.popContent('info')
- _end_feedburner_browserfriendly = _end_info
-
- def _start_generator(self, attrsD):
- if attrsD:
- attrsD = self._itsAnHrefDamnIt(attrsD)
- if attrsD.has_key('href'):
- attrsD['href'] = self.resolveURI(attrsD['href'])
- self._getContext()['generator_detail'] = FeedParserDict(attrsD)
- self.push('generator', 1)
-
- def _end_generator(self):
- value = self.pop('generator')
- context = self._getContext()
- if context.has_key('generator_detail'):
- context['generator_detail']['name'] = value
-
- def _start_admin_generatoragent(self, attrsD):
- self.push('generator', 1)
- value = self._getAttribute(attrsD, 'rdf:resource')
- if value:
- self.elementstack[-1][2].append(value)
- self.pop('generator')
- self._getContext()['generator_detail'] = FeedParserDict({'href': value})
-
- def _start_admin_errorreportsto(self, attrsD):
- self.push('errorreportsto', 1)
- value = self._getAttribute(attrsD, 'rdf:resource')
- if value:
- self.elementstack[-1][2].append(value)
- self.pop('errorreportsto')
-
- def _start_summary(self, attrsD):
- context = self._getContext()
- if context.has_key('summary'):
- self._summaryKey = 'content'
- self._start_content(attrsD)
- else:
- self._summaryKey = 'summary'
- self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
- _start_itunes_summary = _start_summary
-
- def _end_summary(self):
- if self._summaryKey == 'content':
- self._end_content()
- else:
- self.popContent(self._summaryKey or 'summary')
- self._summaryKey = None
- _end_itunes_summary = _end_summary
-
- def _start_enclosure(self, attrsD):
- attrsD = self._itsAnHrefDamnIt(attrsD)
- context = self._getContext()
- attrsD['rel']='enclosure'
- context.setdefault('links', []).append(FeedParserDict(attrsD))
- href = attrsD.get('href')
- if href and not context.get('id'):
- context['id'] = href
-
- def _start_source(self, attrsD):
- if 'url' in attrsD:
- # This means that we're processing a source element from an RSS 2.0 feed
- self.sourcedata['href'] = attrsD[u'url']
- self.push('source', 1)
- self.insource = 1
- self.hasTitle = 0
-
- def _end_source(self):
- self.insource = 0
- value = self.pop('source')
- if value:
- self.sourcedata['title'] = value
- self._getContext()['source'] = copy.deepcopy(self.sourcedata)
- self.sourcedata.clear()
-
- def _start_content(self, attrsD):
- self.pushContent('content', attrsD, 'text/plain', 1)
- src = attrsD.get('src')
- if src:
- self.contentparams['src'] = src
- self.push('content', 1)
-
- def _start_prodlink(self, attrsD):
- self.pushContent('content', attrsD, 'text/html', 1)
-
- def _start_body(self, attrsD):
- self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
- _start_xhtml_body = _start_body
-
- def _start_content_encoded(self, attrsD):
- self.pushContent('content', attrsD, 'text/html', 1)
- _start_fullitem = _start_content_encoded
-
- def _end_content(self):
- copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
- value = self.popContent('content')
- if copyToDescription:
- self._save('description', value)
-
- _end_body = _end_content
- _end_xhtml_body = _end_content
- _end_content_encoded = _end_content
- _end_fullitem = _end_content
- _end_prodlink = _end_content
-
- def _start_itunes_image(self, attrsD):
- self.push('itunes_image', 0)
- self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
- _start_itunes_link = _start_itunes_image
-
- def _end_itunes_block(self):
- value = self.pop('itunes_block', 0)
- self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
-
- def _end_itunes_explicit(self):
- value = self.pop('itunes_explicit', 0)
- self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0
-
- def _start_media_content(self, attrsD):
- context = self._getContext()
- context.setdefault('media_content', [])
- context['media_content'].append(attrsD)
-
- def _start_media_thumbnail(self, attrsD):
- context = self._getContext()
- context.setdefault('media_thumbnail', [])
- self.push('url', 1) # new
- context['media_thumbnail'].append(attrsD)
-
- def _end_media_thumbnail(self):
- url = self.pop('url')
- context = self._getContext()
- if url != None and len(url.strip()) != 0:
- if not context['media_thumbnail'][-1].has_key('url'):
- context['media_thumbnail'][-1]['url'] = url
-
- def _start_media_player(self, attrsD):
- self.push('media_player', 0)
- self._getContext()['media_player'] = FeedParserDict(attrsD)
-
- def _end_media_player(self):
- value = self.pop('media_player')
- context = self._getContext()
- context['media_player']['content'] = value
-
-if _XML_AVAILABLE:
- class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
- def __init__(self, baseuri, baselang, encoding):
- if _debug: sys.stderr.write('trying StrictFeedParser\n')
- xml.sax.handler.ContentHandler.__init__(self)
- _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
- self.bozo = 0
- self.exc = None
- self.decls = {}
-
- def startPrefixMapping(self, prefix, uri):
- self.trackNamespace(prefix, uri)
- if uri == 'http://www.w3.org/1999/xlink':
- self.decls['xmlns:'+prefix] = uri
-
- def startElementNS(self, name, qname, attrs):
- namespace, localname = name
- lowernamespace = str(namespace or '').lower()
- if lowernamespace.find('backend.userland.com/rss') <> -1:
- # match any backend.userland.com namespace
- namespace = 'http://backend.userland.com/rss'
- lowernamespace = namespace
- if qname and qname.find(':') > 0:
- givenprefix = qname.split(':')[0]
- else:
- givenprefix = None
- prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
- if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
- raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
- localname = str(localname).lower()
-
- # qname implementation is horribly broken in Python 2.1 (it
- # doesn't report any), and slightly broken in Python 2.2 (it
- # doesn't report the xml: namespace). So we match up namespaces
- # with a known list first, and then possibly override them with
- # the qnames the SAX parser gives us (if indeed it gives us any
- # at all). Thanks to MatejC for helping me test this and
- # tirelessly telling me that it didn't work yet.
- attrsD, self.decls = self.decls, {}
- if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
- attrsD['xmlns']=namespace
- if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
- attrsD['xmlns']=namespace
-
- if prefix:
- localname = prefix.lower() + ':' + localname
- elif namespace and not qname: #Expat
- for name,value in self.namespacesInUse.items():
- if name and value == namespace:
- localname = name + ':' + localname
- break
- if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))
-
- for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
- lowernamespace = (namespace or '').lower()
- prefix = self._matchnamespaces.get(lowernamespace, '')
- if prefix:
- attrlocalname = prefix + ':' + attrlocalname
- attrsD[str(attrlocalname).lower()] = attrvalue
- for qname in attrs.getQNames():
- attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
- self.unknown_starttag(localname, attrsD.items())
-
- def characters(self, text):
- self.handle_data(text)
-
- def endElementNS(self, name, qname):
- namespace, localname = name
- lowernamespace = str(namespace or '').lower()
- if qname and qname.find(':') > 0:
- givenprefix = qname.split(':')[0]
- else:
- givenprefix = ''
- prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
- if prefix:
- localname = prefix + ':' + localname
- elif namespace and not qname: #Expat
- for name,value in self.namespacesInUse.items():
- if name and value == namespace:
- localname = name + ':' + localname
- break
- localname = str(localname).lower()
- self.unknown_endtag(localname)
-
- def error(self, exc):
- self.bozo = 1
- self.exc = exc
-
- def fatalError(self, exc):
- self.error(exc)
- raise exc
-
-class _BaseHTMLProcessor(sgmllib.SGMLParser):
- special = re.compile('''[<>'"]''')
- bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
- elements_no_end_tag = [
- 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
- 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
- 'source', 'track', 'wbr'
- ]
-
- def __init__(self, encoding, type):
- self.encoding = encoding
- self.type = type
- if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
- sgmllib.SGMLParser.__init__(self)
-
- def reset(self):
- self.pieces = []
- sgmllib.SGMLParser.reset(self)
-
- def _shorttag_replace(self, match):
- tag = match.group(1)
- if tag in self.elements_no_end_tag:
- return '<' + tag + ' />'
- else:
- return '<' + tag + '></' + tag + '>'
-
- def parse_starttag(self,i):
- j=sgmllib.SGMLParser.parse_starttag(self, i)
- if self.type == 'application/xhtml+xml':
- if j>2 and self.rawdata[j-2:j]=='/>':
- self.unknown_endtag(self.lasttag)
- return j
-
- def feed(self, data):
- data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
- #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
- data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
- data = data.replace('&#39;', "'")
- data = data.replace('&#34;', '"')
- if self.encoding and type(data) == type(u''):
- data = data.encode(self.encoding)
- sgmllib.SGMLParser.feed(self, data)
- sgmllib.SGMLParser.close(self)
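A standalone sketch of the short-tag rewrite driven by _shorttag_replace above, with an abbreviated stand-in for elements_no_end_tag:

import re

NO_END_TAG = ['br', 'hr', 'img']  # abbreviated stand-in for elements_no_end_tag

def expand_shorttag(match):
    tag = match.group(1)
    if tag in NO_END_TAG:
        return '<' + tag + ' />'           # void elements stay self-closing
    return '<' + tag + '></' + tag + '>'   # everything else gets a real end tag

result = re.sub(r'<([^<>\s]+?)\s*/>', expand_shorttag, 'line<br/>break<p/>')
# result == 'line<br />break<p></p>'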
-
- def normalize_attrs(self, attrs):
- if not attrs: return attrs
- # utility method to be called by descendants
- attrs = dict([(k.lower(), v) for k, v in attrs]).items()
- attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
- attrs.sort()
- return attrs
-
- def unknown_starttag(self, tag, attrs):
- # called for each start tag
- # attrs is a list of (attr, value) tuples
- # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
- if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
- uattrs = []
- strattrs=''
- if attrs:
- for key, value in attrs:
- value=value.replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
- value = self.bare_ampersand.sub("&amp;", value)
- # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
- if type(value) != type(u''):
- try:
- value = unicode(value, self.encoding)
- except:
- value = unicode(value, 'iso-8859-1')
- uattrs.append((unicode(key, self.encoding), value))
- strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
- if self.encoding:
- try:
- strattrs=strattrs.encode(self.encoding)
- except:
- pass
- if tag in self.elements_no_end_tag:
- self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
- else:
- self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
-
- def unknown_endtag(self, tag):
- # called for each end tag, e.g. for </pre>