|
Server : Apache/2.2.2 (Fedora) System : Linux App1.pathumtani.go.th 2.6.20-1.2320.fc5smp #1 SMP Tue Jun 12 19:40:16 EDT 2007 i686 User : apache ( 48) PHP Version : 5.2.9 Disable Function : NONE Directory : /proc/self/root/usr/lib/python2.4/site-packages/_xmlplus/dom/ext/reader/ |
Upload File : |
import string, re, types, sys
from xml.parsers import sgmlop
from xml.dom import implementation
from xml.dom import Node
from xml.dom import NotSupportedErr
from xml.dom import EMPTY_NAMESPACE
from xml.dom.html import HTML_DTD, HTML_CHARACTER_ENTITIES
# Charset assumed for HTML input when the caller does not supply one.
DEFAULT_CHARSET = 'ISO-8859-1'

# Building blocks for the pattern that recognises a DOCTYPE declaration:
#   _root    -- the document element name
#   _quoted  -- a double- or single-quoted literal
#   _sysId   -- first a template for a named system-id group, then rebound
#               to the stand-alone "SYSTEM <literal>" form
#   _pubId   -- "PUBLIC <literal>" optionally followed by a system literal
# NOTE(review): _quoted is an ungrouped alternation, so inside _pubId the
# optional system1 part only attaches to the single-quoted alternative, and
# the SYSTEM branch has no leading whitespace allowance after the root
# name.  Confirm against the users of _doctype before tightening this.
_root = '(?P<root>[a-zA-Z][a-zA-Z0-9]*)'
_quoted = '|'.join(['("[^"]*")', "('[^']*')"])
_sysId = r'\s*(?P<system%d>' + _quoted + ')'
_pubId = r'\s*PUBLIC\s*(?P<public>%s(%s)?)' % (_quoted, _sysId % 1)
_sysId = 'SYSTEM%s' % (_sysId % 2)
_doctype = re.compile('DOCTYPE %s(%s|%s)?' % (_root, _pubId, _sysId), re.I)
# Probe for the built-in ``unicode``; when the interpreter does not provide
# it, install UTF-8 based stand-ins for ``unicode`` and ``unichr`` built on
# the wstring package.
try:
    unicode()
except NameError:
    # Only a missing built-in should trigger the fallback; any other error
    # is a real problem and must not be masked.
    from xml.unicode.iso8859 import wstring
    wstring.install_alias('ISO-8859-1', 'ISO_8859-1:1987')

    def unicode(str, encoding='US-ASCII'):
        """Create a UTF-8 string

        str      -- the encoded input string
        encoding -- name of the encoding ``str`` is in (upper-cased before
                    being handed to wstring)

        Returns the UTF-8 form of ``str``; if the conversion fails for any
        reason the input is returned unchanged.
        """
        try:
            return wstring.decode(string.upper(encoding), str).utf8()
        except:
            # Deliberately broad: this is a best-effort conversion and the
            # exception types raised by wstring are not visible here.
            return str

    def unichr(char):
        """Create a UTF-8 string from a Unicode character code

        Returns the UTF-8 encoding of the character; if the conversion
        fails for any reason ``char`` is returned unchanged.
        """
        try:
            return wstring.chr(char).utf8()
        except:
            # Deliberately broad, same best-effort policy as unicode().
            return char
class SgmlopParser:
    """Base class that receives sgmlop parser callbacks.

    The instance registers itself with an sgmlop parser object (see
    initParser) and that parser is then fed the whole input in parse().
    Subclasses must override the handler methods that raise
    NotImplementedError here.
    """

    def __init__(self, entities=None):
        """Create the entity table.

        entities -- optional mapping of entity name to replacement text,
                    merged over the five predefined entities below.
        """
        self.entities = {'amp': '&',
                         'apos': "'",
                         'lt': '<',
                         'gt': '>',
                         'quot': '"',
                         }
        if entities:
            self.entities.update(entities)

    def initParser(self, parser):
        """Remember ``parser`` and register this object as its handler."""
        self._parser = parser
        self._parser.register(self)
        return

    def initState(self, ownerDoc=None):
        """Reset per-document state; must be overridden.

        Raises NotImplementedError.
        """
        raise NotImplementedError('initState: ownerDoc=%s' % ownerDoc)

    def parse(self, stream):
        """Read all of ``stream`` and feed it to the registered parser."""
        self._parser.parse(stream.read())
        return

    def handle_special(self, data):
        """Handles <!...> directives"""
        raise NotImplementedError('handle_special: data=%s' % data)

    def handle_proc(self, target, data):
        """Handles processing instructions."""
        raise NotImplementedError('handle_proc: target=%s, data=%s' % (target, data))

    def finish_starttag(self, tagname, attrs):
        """
        In XML mode attrs is a dictionary, otherwise a list.
        """
        raise NotImplementedError('finish_starttag: name=%s' % tagname)

    def finish_endtag(self, tagname):
        """Handle an end tag; must be overridden."""
        raise NotImplementedError('finish_endtag: name=%s' % tagname)

    def handle_entityref(self, name):
        """Resolve the entity ``name``.

        Known entities are forwarded to handle_data() as their replacement
        text; anything else goes to unknown_entityref().
        """
        if name in self.entities:
            self.handle_data(self.entities[name])
        else:
            self.unknown_entityref(name)
        return

    # Character references are handled internally in sgmlop, but a subclass
    # can take them over by defining handle_charref(self, char), where char
    # is the reference as a string: either decimal digits ("DDD") or an 'x'
    # followed by hex digits ("xHHH").

    def handle_cdata(self, data):
        """Handle a CDATA section; must be overridden."""
        raise NotImplementedError('handle_cdata: data=%s' % data)

    def handle_data(self, data):
        """Handle character data; must be overridden."""
        raise NotImplementedError('handle_data: data=%s' % data)

    def handle_comment(self, data):
        """Handle a comment; must be overridden."""
        raise NotImplementedError('handle_comment: data=%s' % data)

    def unknown_endtag(self, name):
        """Hook for unrecognised end tags; ignored by default."""
        pass

    def unknown_entityref(self, name):
        """Hook for unrecognised entity references; ignored by default."""
        pass
# Finds the "charset=<token>" part of a string and exposes the token as
# the named group ``charset``.
g_reCharset = re.compile(r'charset\s*=\s*(?P<charset>[a-zA-Z0-9_\-]+)')

# Entity name -> replacement text, built by inverting the
# character -> name table HTML_CHARACTER_ENTITIES.
HTML_ENTITIES = dict([(name, unichr(char))
                      for (char, name) in HTML_CHARACTER_ENTITIES.items()])
class HtmlParser(SgmlopParser):
    """Builds an HTML DOM fragment from sgmlop SGML parser callbacks.

    Parsed nodes are collected under ``self.rootNode`` (a document
    fragment); ``self._stack`` tracks the chain of currently open elements
    with the fragment at index 0.
    """

    def __init__(self):
        SgmlopParser.__init__(self, HTML_ENTITIES)

    def initParser(self):
        """Create and register an sgmlop SGML parser."""
        SgmlopParser.initParser(self, sgmlop.SGMLParser())

    def initState(self, ownerDoc=None, charset=''):
        """Reset the builder for a new document.

        ownerDoc -- document used to create nodes; a new HTML document is
                    created when omitted
        charset  -- input encoding; DEFAULT_CHARSET when empty
        """
        self._ownerDoc = ownerDoc or implementation.createHTMLDocument('')
        self._charset = charset or DEFAULT_CHARSET
        self.rootNode = self._ownerDoc.createDocumentFragment()
        self._stack = [self.rootNode]
        self._hasHtml = 0
        return

    def handle_special(self, data):
        # This would be a doctype, but HTML DOMs do not use them
        return

    def handle_proc(self, target, data):
        # HTML DOMs do not support processing instructions either.
        return

    def finish_starttag(self, tagname, attrs):
        """Create an element for ``tagname`` and attach it to the tree.

        attrs is a list of (name, value) pairs.  Tags that are not keys of
        HTML_DTD are dropped.  A META http-equiv content-type tag switches
        the charset used for all subsequent decoding.
        """
        unicodeTagName = unicode(tagname, self._charset)
        lowerTagName = string.lower(unicodeTagName)

        if not HTML_DTD.has_key(lowerTagName):
            # Skip any tags not defined in HTML 4.01
            return

        element = self._ownerDoc.createElementNS(EMPTY_NAMESPACE, unicodeTagName)

        # Allows for multiple META tags in a document
        if lowerTagName == 'meta':
            lowered = [(string.lower(name), string.lower(value))
                       for (name, value) in attrs]
            if ('http-equiv', 'content-type') in lowered:
                for (name, value) in lowered:
                    if name == 'content':
                        match = g_reCharset.search(value)
                        if match:
                            self._charset = match.group('charset')

        # Add any attributes to the tag
        for (name, value) in attrs:
            element.setAttributeNS(EMPTY_NAMESPACE,
                                   unicode(name, self._charset),
                                   unicode(value, self._charset))

        # Look for its parent: walk the open elements from the innermost
        # outwards (the root fragment at index 0 is never considered).
        for i in range(1, len(self._stack)):
            parent = self._stack[-i]
            if lowerTagName in HTML_DTD[string.lower(parent.tagName)]:
                parent.appendChild(element)
                if i > 1:
                    # Implicitly close everything nested inside the parent.
                    self._stack = self._stack[:-i + 1]
                if HTML_DTD[lowerTagName]:
                    # Only elements whose HTML_DTD entry is non-empty are
                    # kept open to receive children.
                    self._stack.append(element)
                return

        # no parent found
        if not self._hasHtml and lowerTagName == 'html':
            self._stack[0].appendChild(element)
            self._stack.append(element)
            self._hasHtml = 1
        return

    def finish_endtag(self, tagname):
        """Close the innermost open element whose tagName matches."""
        uppercase = string.upper(unicode(tagname, self._charset))
        # Look for opening tag
        for i in range(1, len(self._stack)):
            element = self._stack[-i]
            if uppercase == element.tagName:
                # Drop the element and everything still open inside it.
                self._stack = self._stack[:-i]
                break
        return

    def _appendText(self, unidata):
        """Add text to the current node, merging with a trailing text node."""
        node = self._stack[-1]
        text_node = node.lastChild or node
        if text_node.nodeType == Node.TEXT_NODE:
            text_node.appendData(unidata)
        else:
            node.appendChild(self._ownerDoc.createTextNode(unidata))

    def handle_entityref(self, name):
        """Append the replacement text of a known entity as text."""
        if name in self.entities:
            self._appendText(self.entities[name])
        else:
            self.unknown_entityref(name)
        return

    def handle_data(self, data):
        """Decode ``data`` with the current charset and append it as text."""
        self._appendText(unicode(data, self._charset))
        return

    def handle_charref(self, value):
        """Append the character named by a numeric character reference.

        value is the reference body as a string: decimal digits, or an
        'x'/'X' followed by hexadecimal digits.
        """
        # Can't rely on sgmlop to handle charrefs itself: it can't
        # report Unicode (since it won't know the document encoding),
        # and it may encounter non-ASCII characters
        if value[0] in ('x', 'X'):
            code = int(value[1:], 16)
        else:
            code = int(value)
        self._appendText(unichr(code))
        return

    def handle_comment(self, data):
        """Append a comment node to the current node."""
        # NOTE(review): unlike handle_data, the comment text is not decoded
        # with self._charset -- confirm whether that is intentional.
        comment = self._ownerDoc.createComment(data)
        self._stack[-1].appendChild(comment)
        return
class XmlParser(SgmlopParser):
    """Namespace-aware DOM builder driven by sgmlop's XML parser.

    ``self._namespaces`` maps prefixes to namespace URIs currently in
    scope; ``self._namespaceStack`` holds, per open element, what must be
    restored and removed when that element closes.

    NOTE(review): when initState() is called without an ownerDoc,
    ``self._ownerDoc`` stays None and the node stack starts empty, so
    finish_starttag()/finish_endtag() cannot work in that mode as written
    here -- confirm how the document is meant to be created in that case.
    """

    def initParser(self):
        """Create and register an sgmlop XML parser."""
        SgmlopParser.initParser(self, sgmlop.XMLParser())

    def initState(self, ownerDoc=None):
        """Reset the builder for a new document.

        ownerDoc -- document used to create nodes; when given, generated
                    nodes are collected in a new document fragment.
        """
        # Imported here because the module-level imports do not provide it.
        from xml.dom import XML_NAMESPACE

        self._ownerDoc = None
        # Set up the stack which keeps track of the nesting of DOM nodes.
        if ownerDoc:
            self._ownerDoc = ownerDoc
            # Create a docfrag to hold all the generated nodes.
            self._rootNode = self._ownerDoc.createDocumentFragment()
            self._stack = [self._rootNode]
        else:
            self._rootNode = None
            self._stack = []
        self._dt = None
        self._xmlDecl = None
        self._orphanedNodes = []
        self._namespaces = {'xml': XML_NAMESPACE}
        self._namespaceStack = []
        self._currText = ''
        return

    def finish_starttag(self, tagname, attrs):
        """Create an element with its attributes and push it on the stack.

        attrs is a dictionary of qualified attribute name to value.
        Namespace declarations among the attributes update the in-scope
        namespace table for the lifetime of the element.
        """
        # Imported here because the module-level imports do not provide them.
        from xml.dom import XMLNS_NAMESPACE
        from xml.dom.ext import SplitQName

        old_nss = {}    # declarations shadowed by this element
        del_nss = []    # prefixes first declared by this element
        split_attrs = {}
        for (name, value) in attrs.items():
            (prefix, local) = SplitQName(name)
            split_attrs[(prefix, local, name)] = value
            # presumably SplitQName reports a namespace declaration with
            # local == 'xmlns' and the declared prefix in ``prefix`` --
            # verify against its definition
            if local != 'xmlns':
                continue
            if prefix in self._namespaces:
                old_nss[prefix] = self._namespaces[prefix]
            else:
                del_nss.append(prefix)
            if prefix or value:
                self._namespaces[prefix] = value
            elif prefix in self._namespaces:
                # xmlns="" undeclares the default namespace for this scope;
                # the shadowed value was saved in old_nss above.
                del self._namespaces[prefix]

        self._namespaceStack.append((old_nss, del_nss))

        (prefix, local) = SplitQName(tagname)
        namespace = self._namespaces.get(prefix, None)
        element = self._ownerDoc.createElementNS(namespace, tagname)

        for ((prefix, local, name), value) in split_attrs.items():
            if local == 'xmlns':
                namespace = XMLNS_NAMESPACE
            else:
                namespace = self._namespaces.get(prefix, None)
            attr = self._ownerDoc.createAttributeNS(namespace, name)
            attr.value = value
            element.setAttributeNodeNS(attr)

        self._stack.append(element)

    def finish_endtag(self, tagname):
        """Close the current element and restore the namespace scope."""
        element = self._stack.pop()
        (old_nss, del_nss) = self._namespaceStack.pop()
        # Remove what this element introduced first, then put back what it
        # shadowed, so a restored declaration can never be deleted again.
        for prefix in del_nss:
            if prefix in self._namespaces:
                del self._namespaces[prefix]
        self._namespaces.update(old_nss)
        self._stack[-1].appendChild(element)
        return