Pythonのminidomパーサーを使って、XML文書からデータを抜き出します。
例えば以下のようなデータから、keiの状態を抜き出すスクリプトです。
<Data>
<Node>
<name>kei</name>
<state>fine</state>
</Node>
<Node>
<name>lei</name>
<state>sick</state>
</Node>
</Data>
まずはNodeというタグ名の要素をすべて抜き出してリストを作ります。
そのうちnameがkeiであるものを探して、そのstateをprintします。
import os
from xml.dom.minidom import parse,parseString
# Sample XML document parsed by lookupState(): a <Data> root holding two
# <Node> records, each with a <name> and a <state> child element.
msg = """
<Data>
<Node>
<name>kei</name>
<state>fine</state>
</Node>
<Node>
<name>lei</name>
<state>sick</state>
</Node>
</Data>
"""
def lookupState(myname, xml_text=None):
    """Return the <state> text of the <Node> whose <name> equals ``myname``.

    myname   -- name to look for. It is compared for equality against the
                <name> text with surrounding whitespace stripped (a
                substring such as "ei" no longer matches "kei").
    xml_text -- XML document to search. When omitted, the module-level
                ``msg`` sample document is used.

    Returns the text of the matching node's <state> element ('' when that
    element is missing or empty), or None when no <Node> matches.
    """
    def getText(node):
        # Join every text/CDATA child so that text split by a comment, or
        # wrapped in a CDATA section, is not silently dropped.
        text_types = (node.TEXT_NODE, node.CDATA_SECTION_NODE)
        return "".join([n.data for n in node.childNodes
                        if n.nodeType in text_types])

    if xml_text is None:
        xml_text = msg
    dom = parseString(xml_text)
    for node in dom.getElementsByTagName("Node"):
        name, state = '', ''
        for n in node.childNodes:
            # Only element nodes have a tagName; skipping everything else
            # avoids AttributeError on processing instructions and CDATA.
            if n.nodeType != n.ELEMENT_NODE:
                continue
            if n.tagName == "name":
                name = getText(n)
            elif n.tagName == "state":
                state = getText(n)
        if name.strip() == myname:
            return state
    return None
# Demo: look up the state recorded for 'kei' in the sample document.
name = 'kei'
state = lookupState(name)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(state)
任意のXMLファイルを、MyXMLNodeによるツリー構造に読み込みます。
読み込んだツリーは、MyXMLParser.rootからアクセスできます。
ポイントは、
parser.StartElementHandler = self.begin_element
parser.EndElementHandler = self.end_element
...
というところです。"self."を付けて代入することで、メンバ関数(インスタンスに束縛されたメソッド)もハンドラとして代入できます。
import xml.parsers.expat
class MyXMLParser:
    """
    Load an XML file into a tree of MyXMLNode objects using expat.

    USAGE : parser = MyXMLParser("tasks.xml")
    Then, the root node can be accessed from parser.root
    """

    def __init__(self, filename):
        """
        Create the parser and, unless filename is None, parse that file.
        """
        # Initialise here so both attributes exist even when no file is given.
        self.current = None
        self.root = None
        if filename is not None:
            self.parse(filename)

    def parse(self, filename):
        """
        Load an XML file

        Resets self.root / self.current, then feeds the file to expat.
        Raises ValueError if an element is still open after parsing;
        errors raised by expat itself propagate unchanged.
        """
        self.current = None
        self.root = None
        parser = xml.parsers.expat.ParserCreate()
        parser.StartElementHandler = self.begin_element
        parser.EndElementHandler = self.end_element
        parser.CharacterDataHandler = self.char_data
        # Binary mode: ParseFile needs read() to return bytes on Python 3.
        # The with-block closes the file even when parsing fails.
        with open(filename, 'rb') as fp:
            parser.ParseFile(fp)
        if self.current is not None:
            raise ValueError("Illegally formatted XML")

    def begin_element(self, name, attrs):
        """
        Only internally used, called when encountering start tags

        Creates a node under the current one and descends into it.
        attrs is accepted because expat passes it, but it is not stored.
        """
        newNode = MyXMLNode(name, self.current)
        if self.current is not None:
            self.current.children.append(newNode)
        self.current = newNode

    def end_element(self, name):
        """
        Only internally used, called when encountering end tags

        Moves back up to the parent node; closing the top-level element
        stores it in self.root. Raises ValueError when the closing tag
        does not match the element that is currently open.
        """
        if self.current is None:
            raise ValueError("Illegally formatted XML")
        if self.current.name != name:
            err_str = "Illegally formatted XML : tag name "
            err_str += name + " doesn't match ("
            err_str += self.current.name + " is expected)"
            raise ValueError(err_str)
        if self.current.parent is not None:
            self.current = self.current.parent
        else:
            self.root = self.current
            self.current = None

    def char_data(self, data):
        """
        Only internally used, called when encountering values

        Appends the character data to the currently open node.
        Raises ValueError when no element is open.
        """
        if self.current is None:
            raise ValueError("Illegally formatted XML")
        self.current.append(data)
class MyXMLNode:
    """
    A class corresponding to an XML node

    Holds the tag name, a reference to the parent node, the list of child
    nodes, and the character data collected for this element.
    """

    def __init__(self, name, parent):
        self.name = name        # tag name of this element
        self.parent = parent    # enclosing node as passed by the caller
        self.children = []      # child nodes, in the order they were added
        self.value = ""         # character data accumulated via append()

    def append(self, value):
        """Append a chunk of character data to this node's value."""
        self.value += value

    def __str__(self, spacer=""):
        """
        Return an indented, multi-line dump of this node and its subtree.

        spacer is the indentation prefix for this level; each child level
        adds one more space. The value is shown with surrounding whitespace
        stripped, but self.value itself is left untouched so that printing
        a tree never modifies it.
        """
        parts = [spacer + "<%s : %s>\n" % (self.name, self.value.strip())]
        for child in self.children:
            parts.append(child.__str__(spacer + " "))
        parts.append(spacer + "</%s>\n" % (self.name))
        return "".join(parts)
# Demo: load tasks.xml and dump the resulting node tree.
parser = MyXMLParser("tasks.xml")
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(parser.root)
[an error occurred while processing this directive]