[ooo-build-commit] scratch/mso-dumper
Kohei Yoshida
kohei at kemper.freedesktop.org
Wed Dec 30 10:33:37 PST 2009
scratch/mso-dumper/src/node.py | 160 ++++++++++++++++++++++++++++++++++++
scratch/mso-dumper/src/xlsmodel.py | 11 ++
scratch/mso-dumper/src/xlsrecord.py | 6 +
scratch/mso-dumper/src/xlsstream.py | 47 +++++++++-
scratch/mso-dumper/xls-dump.py | 64 ++++++++++++--
5 files changed, 272 insertions(+), 16 deletions(-)
New commits:
commit 6a4764069739d404a708bcfb1ebef91bc8f22d69
Author: Kohei Yoshida <kyoshida at novell.com>
Date: Wed Dec 30 13:32:37 2009 -0500
[xls-dump] Added hook for canonical XML output.
* scratch/mso-dumper/src/node.py:
* scratch/mso-dumper/src/xlsmodel.py:
* scratch/mso-dumper/src/xlsrecord.py:
* scratch/mso-dumper/src/xlsstream.py:
* scratch/mso-dumper/xls-dump.py:
diff --git a/scratch/mso-dumper/src/node.py b/scratch/mso-dumper/src/node.py
new file mode 100644
index 0000000..e8d9119
--- /dev/null
+++ b/scratch/mso-dumper/src/node.py
@@ -0,0 +1,160 @@
+# NOTE: this file (node.py) is copied into several of my projects.
+# TODO: find a way to share a single copy instead of duplicating it in each one.
+
+import sys
+
class NodeType:
    """Integer tags that identify the kind of a node.

    Unknown -- unknown node type.
    Root    -- the document root; typically has only one child element, but
               it can have multiple children.
    Element -- node that has a name and attributes, and may have child nodes.
    Content -- node that only has textual content.
    """
    Unknown, Root, Element, Content = range(4)
+
class NodeBase:
    """Common base of all nodes in the tree.

    Every node stores its type tag (nodeType), a reference to its parent
    (None until the node is appended to another node) and an ordered list of
    child nodes.
    """

    def __init__ (self, nodeType = NodeType.Unknown):
        self.parent = None
        self.__children = []
        self.nodeType = nodeType

    def appendChild (self, node):
        """Append node as the last child and make this node its parent."""
        self.__children.append(node)
        node.parent = self

    def appendElement (self, name):
        """Create an Element called name, append it and return it."""
        node = Element(name)
        self.appendChild(node)
        return node

    def appendContent (self, text):
        """Create a Content node holding text, append it and return it."""
        node = Content(text)
        self.appendChild(node)
        return node

    def firstChild (self):
        """Return the first child node, or None when there are no children."""
        # Guard against an empty child list so that a childless node yields
        # None (like firstChildByName) instead of raising IndexError.
        if not self.__children:
            return None
        return self.__children[0]

    def setChildNodes (self, children):
        """Replace the child list with children; the object is stored as is."""
        self.__children = children

    def getChildNodes (self):
        """Return the child list itself (not a copy)."""
        return self.__children

    def firstChildByName (self, name):
        """Return the first child element called name, or None if absent."""
        for child in self.__children:
            if child.nodeType == NodeType.Element and child.name == name:
                return child
        return None

    def getChildByName (self, name):
        """Return a new list with every child element called name."""
        return [child for child in self.__children
                if child.nodeType == NodeType.Element and child.name == name]
+
class Root(NodeBase):
    """Document root node; it is tagged with NodeType.Root."""

    def __init__ (self):
        NodeBase.__init__(self, nodeType=NodeType.Root)
+
class Content(NodeBase):
    """Node that carries a piece of text in its content attribute."""

    def __init__ (self, content):
        NodeBase.__init__(self, nodeType=NodeType.Content)
        self.content = content
+
class Element(NodeBase):
    """Named node with a dictionary of attributes and optional child nodes."""

    def __init__ (self, name, attrs=None):
        """Create an element.

        name  -- element name.
        attrs -- dictionary of attributes; when omitted (None) each element
                 gets its own new, empty dictionary.
        """
        NodeBase.__init__(self, NodeType.Element)
        self.name = name
        if attrs is None:
            attrs = {}
        self.attrs = attrs

    def getContent (self):
        """Return the text below this element, joined with single spaces.

        Content children contribute their text and Element children their
        own getContent() result.  Children of any other type contribute an
        empty string but are still separated from their neighbours by a
        space.
        """
        pieces = []
        for child in self.getChildNodes():
            if child.nodeType == NodeType.Content:
                pieces.append(child.content)
            elif child.nodeType == NodeType.Element:
                pieces.append(child.getContent())
            else:
                pieces.append('')
        return ' '.join(pieces)

    def getAttr (self, name):
        """Return the value of attribute name, or None when it is not set."""
        return self.attrs.get(name)

    def setAttr (self, name, val):
        """Set attribute name to val, replacing any previous value."""
        self.attrs[name] = val

    def hasAttr (self, name):
        """Return True when attribute name is set on this element."""
        # 'in' replaces dict.has_key(), which no longer exists in Python 3.
        return name in self.attrs
+
# Characters that must be written as named entities, mapped to entity names.
encodeTable = {
    '>': 'gt',
    '<': 'lt',
    '&': 'amp',
    '"': 'quot',
    '\'': 'apos'
}

def encodeString (sin):
    """Return sin with markup and non-ASCII characters escaped.

    Characters listed in encodeTable become '&name;' entities.  Characters
    whose ordinal is 128 or above are written as a backslash, 'x' and the
    ordinal in hexadecimal (at least two digits).  Everything else is copied
    unchanged.
    """
    # Collect the pieces and join them once instead of growing a string with
    # '+=' for every character.
    encoded = []
    for c in sin:
        code = ord(c)
        if code >= 128:
            # encode non-ascii ranges.
            encoded.append("\\x%2.2x"%code)
        elif c in encodeTable:
            # encode html symbols ('in' replaces the removed dict.has_key()).
            encoded.append('&' + encodeTable[c] + ';')
        else:
            encoded.append(c)

    return ''.join(encoded)
+
+
def prettyPrint (fd, node):
    """Write node and everything below it to fd, starting at indent level 0."""
    printNode(fd, node, level=0)
+
def printNode (fd, node, level):
    """Recursively write node to fd as indented markup.

    fd    -- object whose write() method receives the text.
    node  -- node to print.  Root, Element and Content nodes are handled;
             a node of any other type produces no output.
    level -- nesting depth; every level indents by four spaces.
    """
    indent = ' '*4*level
    if node.nodeType == NodeType.Root:
        # root node itself only contains child nodes.
        for child in node.getChildNodes():
            printNode(fd, child, level)
    elif node.nodeType == NodeType.Element:
        children = node.getChildNodes()

        # We add '<' and '>' (or '/>') after the attribute values get
        # encoded.
        line = node.name
        # sorted() keeps the attribute order deterministic and, unlike
        # keys() followed by sort(), does not require keys() to be a list.
        for key in sorted(node.attrs):
            line += " " + key + '="' + encodeString(node.attrs[key]) + '"'
        if children:
            fd.write (indent + "<%s>\n"%line)
            for child in children:
                printNode(fd, child, level+1)
            fd.write (indent + "</%s>\n"%node.name)
        else:
            # no children: emit a self-closing tag.
            fd.write (indent + "<%s/>\n"%line)

    elif node.nodeType == NodeType.Content:
        # surrounding whitespace is dropped; empty content prints nothing.
        content = encodeString(node.content.strip())
        if content:
            fd.write (indent + content + "\n")
diff --git a/scratch/mso-dumper/src/xlsmodel.py b/scratch/mso-dumper/src/xlsmodel.py
new file mode 100644
index 0000000..5a572d0
--- /dev/null
+++ b/scratch/mso-dumper/src/xlsmodel.py
@@ -0,0 +1,11 @@
+
+import globals, node
+
class Workbook(object):
    """Workbook model; for now it carries no data of its own."""

    def __init__ (self):
        """Create the model; there is no state to initialize yet."""

    def createDOM (self):
        """Return the model as a node tree: currently a bare 'workbook' element."""
        return node.Element('workbook')
diff --git a/scratch/mso-dumper/src/xlsrecord.py b/scratch/mso-dumper/src/xlsrecord.py
index f55a5a0..2c0e8fc 100644
--- a/scratch/mso-dumper/src/xlsrecord.py
+++ b/scratch/mso-dumper/src/xlsrecord.py
@@ -35,6 +35,12 @@ append a line to be displayed.
"""
pass
+ def fillModel (self, model):
+ """Parse the original bytes and populate the workbook model.
+
+Like parseBytes(), the derived classes must overwrite this method."""
+ pass
+
def output (self):
self.parseBytes()
print("%4.4Xh: %s"%(self.header, "-"*61))
diff --git a/scratch/mso-dumper/src/xlsstream.py b/scratch/mso-dumper/src/xlsstream.py
index 686db95..adca9f3 100644
--- a/scratch/mso-dumper/src/xlsstream.py
+++ b/scratch/mso-dumper/src/xlsstream.py
@@ -399,7 +399,7 @@ class XLDirStream(object):
def __printSep (self, c='-', w=68, prefix=''):
print(prefix + c*w)
- def readRecord (self):
+ def __readRecordBytes (self):
if self.size - self.pos < 4:
raise EndOfStream
@@ -409,6 +409,45 @@ class XLDirStream(object):
raise EndOfStream
size = self.readRaw(2)
bytes = self.readByteArray(size)
+ return pos, header, size, bytes
+
+ def __getRecordHandler (self, header, size, bytes):
+ # record handler that parses the raw bytes and displays more
+ # meaningful information.
+ handler = None
+ if recData.has_key(header) and len(recData[header]) >= 3:
+ handler = recData[header][2](header, size, bytes, self.strmData)
+
+ if handler != None and self.strmData.encrypted:
+ # record handler exists. Parse the record and display more info
+ # unless the stream is encrypted.
+ handler = None
+
+ return handler
+
+ def __postReadRecord (self, header):
+ if recData.has_key(header) and recData[header][0] == "FILEPASS":
+ # presence of FILEPASS record indicates that the stream is
+ # encrypted.
+ self.strmData.encrypted = True
+
+ def fillModel (self, model):
+ pos, header, size, bytes = self.__readRecordBytes()
+ handler = self.__getRecordHandler(header, size, bytes)
+ if handler != None:
+ handler.fillModel(model)
+ self.__postReadRecord(header)
+
+
+ def readRecordXML (self):
+ pos, header, size, bytes = self.__readRecordBytes()
+ handler = self.__getRecordHandler(header, size, bytes)
+ print (recData[header][1])
+ self.__postReadRecord(header)
+ return header
+
+ def readRecord (self):
+ pos, header, size, bytes = self.__readRecordBytes()
# record handler that parses the raw bytes and displays more
# meaningful information.
@@ -449,9 +488,5 @@ class XLDirStream(object):
# unless the stream is encrypted.
handler.output()
- if recData.has_key(header) and recData[header][0] == "FILEPASS":
- # presence of FILEPASS record indicates that the stream is
- # encrypted.
- self.strmData.encrypted = True
-
+ self.__postReadRecord(header)
return header
diff --git a/scratch/mso-dumper/xls-dump.py b/scratch/mso-dumper/xls-dump.py
index 9ec6e35..56e025b 100755
--- a/scratch/mso-dumper/xls-dump.py
+++ b/scratch/mso-dumper/xls-dump.py
@@ -2,7 +2,7 @@
import sys, os.path, optparse
sys.path.append(sys.path[0]+"/src")
-import ole, xlsstream, globals
+import ole, xlsstream, globals, node, xlsmodel
from globals import error
@@ -31,7 +31,29 @@ class XLDumper(object):
self.__parseFile()
dirnames = self.strm.getDirectoryNames()
for dirname in dirnames:
- print (dirname)
+ if dirname != "Workbook":
+ # for now, we only dump the Workbook directory stream.
+ continue
+
+ dirstrm = self.strm.getDirectoryStreamByName(dirname)
+ self.__readSubStreamXML(dirstrm)
+
+ def dumpCanonicalXML (self):
+ self.__parseFile()
+ dirnames = self.strm.getDirectoryNames()
+ docroot = node.Root()
+ root = docroot.appendElement('xls-dump')
+
+ for dirname in dirnames:
+ if dirname != "Workbook":
+ # for now, we only dump the Workbook directory stream.
+ continue
+
+ dirstrm = self.strm.getDirectoryStreamByName(dirname)
+ wbmodel = self.__buildWorkbookModel(dirstrm)
+ root.appendChild(wbmodel.createDOM())
+
+ node.prettyPrint(sys.stdout, docroot)
def dump (self):
self.__parseFile()
@@ -75,17 +97,33 @@ class XLDumper(object):
except xlsstream.EndOfStream:
return False
+ def __readSubStreamXML (self, strm):
+ try:
+ while True:
+ strm.readRecordXML()
+ except xlsstream.EndOfStream:
+ pass
+
+ def __buildWorkbookModel (self, strm):
+ model = xlsmodel.Workbook()
+ try:
+ while True:
+ strm.fillModel(model)
+ except xlsstream.EndOfStream:
+ pass
+
+ return model
def main ():
parser = optparse.OptionParser()
parser.add_option("-d", "--debug", action="store_true", dest="debug", default=False,
- help="turn on debug mode")
+ help="Turn on debug mode")
parser.add_option("--show-sector-chain", action="store_true", dest="show_sector_chain", default=False,
- help="show sector chain information at the start of the output.")
+ help="Show sector chain information at the start of the output.")
parser.add_option("--show-stream-pos", action="store_true", dest="show_stream_pos", default=False,
- help="show the position of each record relative to the stream.")
- parser.add_option("--dump-xml", action="store_true", dest="dump_xml", default=False,
- help="dump content in XML format.")
+ help="Show the position of each record relative to the stream.")
+ parser.add_option("--dump-mode", dest="dump_mode", default="flat", metavar="MODE",
+ help="Specify the dump mode. Possible values are: 'flat', 'xml', or 'canonical-xml'. The default value is 'flat'.")
options, args = parser.parse_args()
params = globals.Params()
params.debug = options.debug
@@ -95,13 +133,19 @@ def main ():
if len(args) < 1:
globals.error("takes at least one argument\n")
parser.print_help()
- return
+ sys.exit(1)
dumper = XLDumper(args[0], params)
- if options.dump_xml:
+ if options.dump_mode == 'flat':
+ dumper.dump()
+ elif options.dump_mode == 'xml':
dumper.dumpXML()
+ elif options.dump_mode == 'canonical-xml':
+ dumper.dumpCanonicalXML()
else:
- dumper.dump()
+ error("unknown dump mode: '%s'\n"%options.dump_mode)
+ parser.print_help()
+ sys.exit(1)
if __name__ == '__main__':
main()
More information about the ooo-build-commit
mailing list