Compile HTML into a cPickle'able Object

The code below can compile HTML into an object tree and optimize that tree, but it doesn't yet subclass objects:

from HTMLParser import HTMLParser
 
class NoTag:
    """A tag-less container of parsed HTML content.

    ``Content`` is a list whose items are either plain strings (literal
    markup/text) or tag objects.  ``Parent`` is the node that contains this
    one, or ``None`` when there is none (e.g. the root of a tree).
    """

    def __init__(self):
        self.Content, self.Parent = [], None

    def __str__(self):
        """Return the string forms of all content items, concatenated."""
        return "".join(map(str, self.Content))

    def __repr__(self):
        return "<%s %r>" % (self.__class__.__name__, self.Content)

    def _Pre(self):
        """Return the markup emitted before the content (nothing here)."""
        return ""

    def _Post(self):
        """Return the markup emitted after the content (nothing here)."""
        return ""

    def Optimize(self):
        """Collapse the tree in place.

        First every tag without TAL attributes is replaced by its literal
        markup, then adjacent strings are merged, so that only tags carrying
        TAL attributes remain as objects.
        """
        self._Optimize1()
        self._Optimize2()

    def _Optimize1(self):
        """Replace each child tag that has no TAL attributes by its parts.

        Such a tag becomes ``_Pre()`` + its (already flattened) content +
        ``_Post()`` inside this node's ``Content``.  Tags that do carry TAL
        attributes stay in place as objects, but their own content is
        flattened as well.
        """
        Flat = []
        for Item in self.Content:
            if not hasattr(Item, "TALAttrs"):
                # Plain string (or other non-tag item): keep as is.
                Flat.append(Item)
                continue
            # Always descend, so that static markup nested inside a tag
            # with TAL attributes is flattened too.
            Item._Optimize1()
            if Item.TALAttrs:
                Flat.append(Item)
                continue
            # The children move up one level; keep their Parent accurate
            # instead of leaving it pointing at the discarded tag.
            for Child in Item.Content:
                if hasattr(Child, "Parent"):
                    Child.Parent = self
            Flat.append(Item._Pre())
            Flat.extend(Item.Content)
            Flat.append(Item._Post())
        # Mutate in place so existing references to the list stay valid.
        self.Content[:] = Flat

    def _Optimize2(self):
        """Merge every run of adjacent strings into one string, recursively.

        Every child that has an ``_Optimize2`` method is optimized too,
        including one in the last position.
        """
        Merged, Run = [], []
        for Item in self.Content:
            if hasattr(Item, "_Optimize2"):
                Item._Optimize2()
            elif type(Item) is str:
                # Collect the run; it is joined once when the run ends.
                Run.append(Item)
                continue
            if Run:
                Merged.append("".join(Run))
                Run = []
            Merged.append(Item)
        if Run:
            Merged.append("".join(Run))
        self.Content[:] = Merged
 
class Tag(NoTag):
    """An HTML element: a tag name, its attributes and its child content.

    Attributes whose name starts with ``tal:`` (case-insensitively) are kept
    in ``TALAttrs``; all others go into ``Attrs``.  Both are dictionaries
    keyed by attribute name, so a repeated name keeps only its last value.
    """

    def __init__(self, Type, Attrs):
        """Create a tag named ``Type`` from an iterable of attribute objects.

        Each attribute object must provide a ``Name`` attribute.
        """
        NoTag.__init__(self)
        self.Type, self.Attrs, self.TALAttrs = Type, {}, {}
        for Attr in Attrs:
            if Attr.Name.lower().startswith("tal:"):
                self.TALAttrs[Attr.Name] = Attr
            else:
                self.Attrs[Attr.Name] = Attr

    def _Head(self):
        """Return the tag name followed by all attributes, space-separated.

        Regular attributes come first, then the TAL attributes.
        """
        Parts = [self.Type]
        Parts.extend([str(Attr) for Attr in self.Attrs.values()])
        Parts.extend([str(Attr) for Attr in self.TALAttrs.values()])
        return " ".join(Parts)

    def _Pre(self):
        """Return the opening tag markup."""
        return "<%s>" % self._Head()

    def _Post(self):
        """Return the closing tag markup."""
        return "</%s>" % self.Type

    def __str__(self):
        """Return the element as markup; an empty element is self-closed."""
        if self.Content:
            return "%s%s%s" % (self._Pre(), NoTag.__str__(self),
                self._Post())
        return "<%s />" % self._Head()

    def __repr__(self):
        return "<%s Type=%s Attrs=%r TALAttrs=%r Content=%r>" % (
            self.__class__.__name__, self.Type, self.Attrs, self.TALAttrs,
            self.Content)
 
class Attribute:
    """A single HTML attribute built from a ``(name, value)`` pair.

    ``Value`` is ``None`` for an attribute written without a value
    (e.g. ``checked``).
    """

    def __init__(self, Attrs):
        """Unpack the two-item sequence ``Attrs`` into Name and Value."""
        self.Name, self.Value = Attrs

    def __str__(self):
        """Return the attribute as HTML markup.

        A value-less attribute is rendered as its bare name.  Otherwise the
        value is quoted with single quotes, or with double quotes when it
        contains a single quote but no double quote.  ``&`` is always
        written as ``&amp;``, and when the value contains both kinds of
        quote the single quotes are written as ``&#39;``.
        """
        if self.Value is None:
            return self.Name
        # Quote for HTML rather than with repr(): repr() would emit Python
        # escapes (backslashes, "\n", a u'' prefix) that are not valid in
        # markup.  "%s" also accepts values that are not strings.
        Value = ("%s" % (self.Value,)).replace("&", "&amp;")
        if "'" not in Value:
            Quote = "'"
        elif '"' not in Value:
            Quote = '"'
        else:
            Quote = "'"
            Value = Value.replace("'", "&#39;")
        return "%s=%s%s%s" % (self.Name, Quote, Value, Quote)

    def __repr__(self):
        return "<%s %s=%r>" % (self.__class__.__name__, self.Name, self.Value)
 
class ZRParser(HTMLParser):
    """HTML parser that builds a tree of NoTag/Tag objects.

    The finished tree is available as ``RetVal``; ``AddPos`` is the node
    that currently receives new content while parsing.
    """

    def __init__(self):
        self.RetVal = self.AddPos = NoTag()
        HTMLParser.__init__(self)

    def Add(self, Obj):
        """Append ``Obj`` (a string or a tag) at the current position."""
        if hasattr(self.AddPos, "Content"):
            self.AddPos.Content.append(Obj)
        else:
            self.AddPos.append(Obj)

    def handle_starttag(self, Type, Attrs):
        """Open a new tag and make it the current position."""
        NewTag = Tag(Type, [Attribute(Pair) for Pair in Attrs])
        NewTag.Parent = self.AddPos
        self.Add(NewTag)
        self.AddPos = NewTag

    def handle_endtag(self, Type):
        """Close the nearest open tag named ``Type``.

        Tags opened after it that were never closed (``<br>``, ``<input>``,
        ...) are treated as empty: what was collected inside them is moved
        up to become their following siblings.  An end tag that matches no
        open tag is ignored and leaves the tree untouched.
        """
        # Look for the tag being closed before changing anything; the root
        # node has no Type, so it never matches.
        Node = self.AddPos
        while Node is not None and getattr(Node, "Type", None) != Type:
            Node = Node.Parent
        if Node is None:
            return
        while self.AddPos is not Node:
            # End tag does not match the current tag!  So the last items we
            # have processed are actually siblings to that tag, not children.
            Unclosed = self.AddPos
            self.AddPos = Unclosed.Parent
            for Child in Unclosed.Content:
                if hasattr(Child, "Parent"):
                    Child.Parent = self.AddPos
            self.AddPos.Content.extend(Unclosed.Content)
            Unclosed.Content = []
        self.AddPos = Node.Parent

    def handle_data(self, Text):
        self.Add(Text)

    def handle_charref(self, Name):
        self.Add("&#%s;" % Name)

    def handle_entityref(self, Name):
        self.Add("&%s;" % Name)

    def handle_comment(self, Text):
        self.Add("<!--%s-->" % Text)

    def handle_decl(self, Decl):
        """Keep declarations such as ``<!DOCTYPE html>`` in the output."""
        self.Add("<!%s>" % Decl)

    def handle_pi(self, Data):
        """Keep processing instructions (``<?...>``) in the output."""
        self.Add("<?%s>" % Data)

Here it is in action:

Python 2.4.3 (#69, Mar 29 2006, 17:35:34) [MSC v.1310 32 bit (Intel)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> from html import ZRParser
>>> Parser = ZRParser()
>>> Parser.feed("""<html>
... <head>
...   <title>a title</title>
... </head>
... <body>
...   <h1 tal:content="header">heading text</h1>
...   <!-- a comment -->
...   <p>
...     paragraph text<br>
...     and more<br />
...     &copy; 2006 &#127;
...   </p>
...   <input type=radio checked name=yup>
... </body>
... </html>""")
>>> Parser.RetVal.Optimize()
>>> print str(Parser.RetVal)
<html>
<head>
  <title>a title</title>
</head>
<body>
  <h1 tal:content='header'>heading text</h1>
  <!-- a comment -->
  <p>
    paragraph text<br></br>
    and more<br></br>
    &copy; 2006 &#127;
  </p>
  <input checked type='radio' name='yup'></input>
</body>
</html>
>>> print repr(Parser.RetVal)
<NoTag ['<html>\n<head>\n  <title>a title</title>\n</head>\n<body>\n  ',
    <Tag Type=h1 Attrs={} TALAttrs={'tal:content': <Attribute tal:content='header'>}
        Content=['heading text']>,
        "\n  <!-- a comment -->\n  <p>\n    paragraph text<br></br>\n"
        "    and more<br></br>\n    &copy; 2006 &#127;\n  </p>\n"
        "  <input checked type='radio' name='yup'></input>\n</body>\n</html>"]>

I reformatted the repr() output above by hand to make it more legible.

Unless otherwise stated, the content of this page is licensed under Creative Commons Attribution-Share Alike 2.5 License.