#!/usr/bin/env python
import string
#Copyright (c) 2008, Patrick
#
#All rights reserved.
#
#Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
# * Neither the names of ohbah.com, secondpagemedia.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
#
#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
#EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
#LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
#NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
def WSorText(String):
    """Wraps String in the element class that matches its content.

    A string made up entirely of whitespace comes back as a WS object;
    anything else (including the empty string, for which isspace() is
    false) comes back as a Text object."""
    if not String.isspace():
        return Text(String)
    return WS(String)
def HasAttrs(String):
    """Check and see if an xml tag has arguments in it.

    String is the text found between the angle brackets of a tag.  The tag
    is considered to have attributes when something follows the tag name,
    separated from it by any kind of whitespace (space, tab, newline...).
    Leading and trailing whitespace alone does not count.

    Returns True or False."""
    # split() with no arguments splits on runs of any whitespace and ignores
    # leading/trailing whitespace, so more than one piece means "name + more".
    return len(String.split()) > 1
def GrabAttrs(String):
    """Extracts the attributes from the inside of a tag and returns them as a
    hash.

    String is the text between the angle brackets, tag name first (for
    example 'a href="x"').  Everything up to the first run of whitespace is
    taken to be the tag name and is skipped.

    Returns a dict mapping each attribute name, lowercased, to its value:
     * name="value" and name='value' give the text between the quotes
     * name=value (unquoted) gives the text up to the next whitespace
     * a bare name gives ''
    Whitespace around the "=" is allowed, and a lone "/" (the marker that
    ends an empty-element tag) is not treated as an attribute.  If a quoted
    value is never closed, parsing stops there and the attributes collected
    so far are returned."""
    Length = len(String)
    Attr = {}
    ii = 0
    #Skip any leading whitespace, then the tag name itself
    while ii < Length and String[ii].isspace():
        ii += 1
    while ii < Length and not String[ii].isspace():
        ii += 1
    while True:
        while ii < Length and String[ii].isspace(): #Cut through the whitespace
            ii += 1
        if ii >= Length:
            return Attr
        #Find the attribute's name
        NStart = ii
        while ii < Length and String[ii] != "=" and not String[ii].isspace():
            ii += 1
        AttrName = String[NStart:ii]
        #Look past optional whitespace for an "=".  Only commit to the new
        #position if one is there, otherwise this was a bare attribute.
        jj = ii
        while jj < Length and String[jj].isspace():
            jj += 1
        if jj < Length and String[jj] == "=":
            ii = jj + 1
            while ii < Length and String[ii].isspace():
                ii += 1
            if ii < Length and String[ii] in "'\"":
                #Quoted value: runs to the matching quote character
                VEnd = String.find(String[ii], ii + 1)
                if VEnd == -1:
                    return Attr
                AttrValue = String[ii + 1:VEnd]
                ii = VEnd + 1
            else:
                #Unquoted value: runs to the next whitespace or the end
                VStart = ii
                while ii < Length and not String[ii].isspace():
                    ii += 1
                AttrValue = String[VStart:ii]
        else:
            AttrValue = ''
        #An empty name comes from a stray "="; "/" closes an empty element
        if AttrName and AttrName != "/":
            Attr[AttrName.lower()] = AttrValue
#The following classes represent XML datatypes
class XMLStr(str):
    """Converts the five predefined XML entities (&lt; &gt; &apos; &quot;
    &amp;) into the characters they stand for on creation; other than that
    identical to a normal string.

    Numeric character references are left untouched."""
    def __new__(Class, Value):
        #str is immutable, so the converted text has to be supplied when the
        #object is created; changing it from __init__ has no effect.
        #&amp; is replaced last so that "&amp;lt;" becomes "&lt;", not "<".
        Value = Value.replace("&lt;", "<")
        Value = Value.replace("&gt;", ">")
        Value = Value.replace("&apos;", "'")
        Value = Value.replace("&quot;", '"')
        Value = Value.replace("&amp;", "&")
        #XXX Need to work with &#xx;
        return str.__new__(Class, Value)
class WS(XMLStr):
    """Text made up entirely of whitespace.  An XMLStr that adds nothing
    but its own type, so treat it as a string"""
class Text(XMLStr):
    """Text that is not pure whitespace.  An XMLStr that adds nothing but
    its own type, so treat it as a string"""
class StartTag(XMLStr):
    """The name of an opening tag without attributes.  An XMLStr that adds
    nothing but its own type, so treat it as a string"""
class EndTag(XMLStr):
    """The name of a closing tag.  An XMLStr that adds nothing but its own
    type, so treat it as a string"""
class EETag(XMLStr):
    """The name of an empty-element tag without attributes.  An XMLStr that
    adds nothing but its own type, so treat it as a string"""
class ProcessingInstruction(XMLStr):
    """The contents of a processing instruction.  An XMLStr that adds
    nothing but its own type, so treat it as a string"""
class Comment(XMLStr):
    """The text of a comment.  An XMLStr that adds nothing but its own
    type, so treat it as a string"""
class TagAttr(str):
    """Much like a string, except TagAttr.Attr is a hash and
    TagAttr.Keys() returns keys to the hash. Note that comparisons between
    two TagAttr (and their children) don't compare the hash, just the string"""
    def __init__(Self, Value = ''):
        #The string value itself was already set when the object was created
        #(str is immutable); all that is left to set up is the hash.
        str.__init__(Self)
        Self.Attr = {}
    def Keys(Self):
        """Returns the keys (attribute names) of Self.Attr"""
        return Self.Attr.keys()
    def __repr__(Self):
        return "'%s', %s" % (Self, Self.Attr)
class StartTagAttr(TagAttr):
    """The name of an opening tag that has attributes.  A TagAttr under a
    different name"""
class EETagAttr(TagAttr):
    """The name of an empty-element tag that has attributes.  A TagAttr
    under a different name"""
class Declaration(str):
    """Much like a string, defaults to the FOO in <!FOO bar baz> for XML
    declarations. Declaration.Data contains everything to the right of the
    first bit of whitespace ("bar baz" in that example). Data starts out
    empty and is filled in by whoever creates the object."""
    def __init__(Self, Value = ''):
        #The string value itself was already set when the object was created
        #(str is immutable); only the extra attribute needs setting up.
        str.__init__(Self)
        Self.Data = ''
    def __repr__(Self):
        return "'%s', '%s'" % (Self, Self.Data)
class Parse:
    """Simple parser. Works on a string: Data is searched with find(),
    measured with len() and sliced, so read file-like objects first, for
    example Parse(F.read()).

    Returns another element every time that Read() is called, False if
    you're at the end of the data, and -1 on an Error. If there's an error,
    Parse.Error will be updated to reflect the error. Iterating over a Parse
    object yields elements until the end of the data or the first error.

    Elements are created through the element classes named below; any entity
    handling is whatever those classes do with the text they are given.
    Possible elements are:
    StartTag: <tag>, holds the tag name
    EndTag: </tag>, holds the tag name
    EETag: <tag />, holds the tag name
    Text: Strings found between the tags.
    WS: Strings consisting entirely of whitespace. A single newline directly
        in front of a tag is skipped and not reported.
    StartTagAttr: <tag name="value">, holds the tag name, .Attr the attributes
    EETagAttr: <tag name="value" />, holds the tag name, .Attr the attributes
    ProcessingInstruction: <?text?>, holds the stripped text
    Comment: <!-- text -->, holds the stripped text
    Declaration: <!FOO bar baz>, holds FOO, .Data holds "bar baz" """
    def __init__(Self, Data):
        Self.Data = Data
        Self.Index = 0 #Position of the next unread character
        Self.Error = ""
    def __repr__(Self):
        #Slicing past the end of a string is safe, so no length check needed
        return "%s:'%s'" % (Self.Index, Self.Data[Self.Index:Self.Index + 20])
    def _Fail(Self, Message):
        """Records Message in Self.Error and returns the error value, -1"""
        Self.Error = Message
        return -1
    def Read(Self):
        """Returns a single element, False at the end of the data, or -1 on
        an error (Self.Error then says what went wrong)"""
        Data = Self.Data
        if Self.Index == len(Data):
            #We're at the end of the stream
            return False
        Spot = Data.find("<", Self.Index) #Find the next tag
        if Spot != -1 and Data[Self.Index:Spot] == '\n': #Single newline
            Self.Index = Spot                            #doesn't count
        if Spot == -1: #No tag found, must be text from here to the end
            Ret = WSorText(Data[Self.Index:])
            Self.Index = len(Data)
            return Ret
        if Spot != Self.Index:
            #The tag is somewhere in the future, ergo, everything
            #up to the tag must be text
            Ret = WSorText(Data[Self.Index:Spot])
            Self.Index = Spot
            return Ret
        #Found a tag right under the index. Basic flow for tags:
        #Identify the tag type -> Find the close bracket -> extract the data,
        #update the index. "<!--" has to be tested before "<!".
        if Data.startswith("<!--", Spot): #It's a comment
            End = Data.find("-->", Spot + 4)
            if End == -1:
                return Self._Fail("Could not find closing bracket for a comment at %s" % Self.Index)
            Ret = Comment(Data[Spot + 4:End].strip())
            Self.Index = End + 3
        elif Data.startswith("<?", Spot): #It's a PI
            End = Data.find("?>", Spot + 2)
            if End == -1:
                return Self._Fail("Could not find closing bracket for a Programming Instruction at %s" % Self.Index)
            #End is the position of "?>" itself, so the slice stops at End
            Ret = ProcessingInstruction(Data[Spot + 2:End].strip())
            Self.Index = End + 2
        elif Data.startswith("<!", Spot): #It's a declaration
            End = Data.find(">", Spot + 2)
            if End == -1:
                return Self._Fail("Could not find closing bracket for a declaration at %s" % Self.Index)
            Content = Data[Spot + 2:End].split()
            if not Content:
                return Self._Fail("Empty declaration found at %s" % Self.Index)
            Ret = Declaration(Content[0])
            Ret.Data = " ".join(Content[1:])
            Self.Index = End + 1
        else: #Standard tag of some sort.
            End = Data.find(">", Spot + 1)
            if End == -1:
                return Self._Fail("Could not find closing bracket for a tag at %s" % Self.Index)
            Contents = Data[Spot + 1:End].strip()
            if len(Contents) == 0:
                return Self._Fail("Empty tag found at %s" % Self.Index)
            Self.Index = End + 1
            if Contents[0] == '/': #It's a closing tag
                Ret = EndTag(Contents[1:].strip())
            elif Contents[-1] == '/': #It's an empty element
                Body = Contents[:-1].strip() #Everything but the closing "/"
                if HasAttrs(Body):
                    Ret = EETagAttr(Body.split()[0])
                    Ret.Attr = GrabAttrs(Body)
                else:
                    Ret = EETag(Body)
            else: #It's a start tag
                if HasAttrs(Contents):
                    Ret = StartTagAttr(Contents.split()[0])
                    Ret.Attr = GrabAttrs(Contents)
                else:
                    Ret = StartTag(Contents)
        return Ret
    def __iter__(Self):
        return Self
    def next(Self):
        Tag = Self.Read()
        #Elements are strings, which never compare equal to -1, so this only
        #stops on the end of the data (False) or an error (-1)
        if Tag is False or Tag == -1:
            raise StopIteration
        return Tag
    __next__ = next #Same method under the name newer Pythons look for
#ADD SAMPLE CODE TO DEMONSTRATE PARSER BY READING AND OUTPUTTING A BIT OF INFORMATION
import urllib2
curl = urllib2.build_opener()
#Testline: reload(psxml); F = open("test.xml"); Self = psxml.Parse(F.read())
if __name__ == "__main__":
    #Example. Fetches a web page and reprints its markup, one element per
    #line, indented one space per level of open tags.
    import sys
    #To reprint a local file instead:
    #if len(sys.argv) != 2:
    #    print "Quck Demo: psxml.py <file>"
    #    sys.exit(1)
    #F = open(sys.argv[1])
    #XML = Parse(F.read())
    #F.close()
    def _AttrText(Tag):
        """Returns one 'name = "value"' string per attribute of Tag"""
        return ['%s = "%s"' % (jj, Tag.Attr[jj]) for jj in Tag.Attr]
    XML = Parse(curl.open("http://www.xkcd.com/10/").read())
    TabSpot = 0
    for ii in XML:
        #check the type:
        Kind = ii.__class__
        if Kind is WS:
            continue #Ignore whitespace, we'll make our own
        if Kind is EndTag:
            #Step back out first so the end tag lines up with its start tag
            TabSpot -= 1
        TabFill = " " * TabSpot
        if Kind is Text:
            print(TabFill + ii)
        elif Kind is StartTag:
            print(TabFill + "<" + ii + ">")
            TabSpot += 1
        elif Kind is EndTag:
            print(TabFill + "</" + ii + ">")
        elif Kind is EETag:
            print(TabFill + "<" + ii + " />")
        elif Kind is ProcessingInstruction:
            print(TabFill + "<?" + ii + "?>")
        elif Kind is Comment:
            print(TabFill + "<!--" + ii + "-->")
        elif Kind is StartTagAttr:
            print(" ".join([TabFill + "<" + ii] + _AttrText(ii) + [">"]))
            TabSpot += 1
        elif Kind is EETagAttr:
            print(" ".join([TabFill + "<" + ii] + _AttrText(ii) + ["/>"]))
        elif Kind is Declaration:
            print(TabFill + "<!" + ii + " " + ii.Data + ">")