[ooo-build-commit] scratch/mso-dumper

Wed Dec 30 10:33:37 PST 2009

scratch/mso-dumper/src/node.py      |  160 ++++++++++++++++++++++++++++++++++++
 scratch/mso-dumper/src/xlsmodel.py  |   11 ++
 scratch/mso-dumper/src/xlsrecord.py |    6 +
 scratch/mso-dumper/src/xlsstream.py |   47 +++++++++-
 scratch/mso-dumper/xls-dump.py      |   64 ++++++++++++--
 5 files changed, 272 insertions(+), 16 deletions(-)

New commits:
commit 6a4764069739d404a708bcfb1ebef91bc8f22d69
Author: Kohei Yoshida <kyoshida at novell.com>
Date:   Wed Dec 30 13:32:37 2009 -0500

    [xls-dump] Added hook for canonical XML output.
    
    * scratch/mso-dumper/src/node.py:
    * scratch/mso-dumper/src/xlsmodel.py:
    * scratch/mso-dumper/src/xlsrecord.py:
    * scratch/mso-dumper/src/xlsstream.py:
    * scratch/mso-dumper/xls-dump.py:

diff --git a/scratch/mso-dumper/src/node.py b/scratch/mso-dumper/src/node.py
new file mode 100644
index 0000000..e8d9119
--- /dev/null
+++ b/scratch/mso-dumper/src/node.py
@@ -0,0 +1,160 @@
+# This file (node.py) gets copied in several of my projects.  Find out a way 
+# to avoid making duplicate copies in each of my projects.
+
+import sys
+
+class NodeType:
+    # unknown node type.
+    Unknown = 0
+    # the document root - typically has only one child element, but it can 
+    # have multiple children.
+    Root    = 1 
+    # node that has name and attributes, and may have child nodes.
+    Element = 2
+    # node that only has textual content.
+    Content = 3
+
+class NodeBase:
+    def __init__ (self, nodeType = NodeType.Unknown):
+        self.parent = None
+        self.__children = []
+        self.nodeType = nodeType
+
+    def appendChild (self, node):
+        self.__children.append(node)
+        node.parent = self
+
+    def appendElement (self, name):
+        node = Element(name)
+        self.appendChild(node)
+        return node
+
+    def appendContent (self, text):
+        node = Content(text)
+        self.appendChild(node)
+        return node
+
+    def firstChild (self):
+        return self.__children[0]
+
+    def setChildNodes (self, children):
+        self.__children = children
+
+    def getChildNodes (self):
+        return self.__children
+
+    def firstChildByName (self, name):
+        for child in self.__children:
+            if child.nodeType == NodeType.Element and child.name == name:
+                return child
+        return None
+
+    def getChildByName (self, name):
+        children = []
+        for child in self.__children:
+            if child.nodeType == NodeType.Element and child.name == name:
+                children.append(child)
+        return children
+
+class Root(NodeBase):
+    def __init__ (self):
+        NodeBase.__init__(self, NodeType.Root)
+
+class Content(NodeBase):
+    def __init__ (self, content):
+        NodeBase.__init__(self, NodeType.Content)
+        self.content = content
+
+class Element(NodeBase):
+    def __init__ (self, name, attrs=None):
+        NodeBase.__init__(self, NodeType.Element)
+        self.name = name
+        self.attrs = attrs
+        if self.attrs == None:
+            self.attrs = {}
+
+    def getContent (self):
+        text = ''
+        first = True
+        for child in self.getChildNodes():
+            if first:
+                first = False
+            else:
+                text += ' '
+            if child.nodeType == NodeType.Content:
+                text += child.content
+            elif child.nodeType == NodeType.Element:
+                text += child.getContent()
+        return text
+
+    def getAttr (self, name):
+        if not self.attrs.has_key(name):
+            return None
+        return self.attrs[name]
+
+    def setAttr (self, name, val):
+        self.attrs[name] = val
+
+    def hasAttr (self, name):
+        return self.attrs.has_key(name)
+
+encodeTable = {
+    '>': 'gt',
+    '<': 'lt',
+    '&': 'amp',
+    '"': 'quot',
+    '\'': 'apos'
+}
+
+def encodeString (sin):
+    sout = ''
+    for c in sin:
+        if ord(c) >= 128:
+            # encode non-ascii ranges.
+            sout += "\\x%2.2x"%ord(c)
+        elif encodeTable.has_key(c):
+            # encode html symbols.
+            sout += '&' + encodeTable[c] + ';'
+        else:
+            sout += c
+
+    return sout
+
+
+def prettyPrint (fd, node):
+    printNode(fd, node, 0)
+
+def printNode (fd, node, level):
+    singleIndent = ' '*4
+    indent = singleIndent*level
+    if node.nodeType == NodeType.Root:
+        # root node itself only contains child nodes.
+        for child in node.getChildNodes():
+            printNode(fd, child, level)
+    elif node.nodeType == NodeType.Element:
+        hasChildren = len(node.getChildNodes()) > 0
+
+        # We add '<' and '>' (or '/>') after the element content gets 
+        # encoded.
+        line = node.name
+        if len(node.attrs) > 0:
+            keys = node.attrs.keys()
+            keys.sort()
+            for key in keys:
+                line += " " + key + '="' + encodeString(node.attrs[key]) + '"'
+        if hasChildren:
+            line = "<%s>\n"%line
+            fd.write (indent + line)
+            for child in node.getChildNodes():
+                printNode(fd, child, level+1)
+            line = "</%s>\n"%node.name
+            fd.write (indent + line)
+        else:
+            line = "<%s/>\n"%line
+            fd.write (indent + line)
+
+    elif node.nodeType == NodeType.Content:
+        content = node.content.strip()
+        content = encodeString(content)
+        if len(content) > 0:
+            fd.write (indent + content + "\n")
diff --git a/scratch/mso-dumper/src/xlsmodel.py b/scratch/mso-dumper/src/xlsmodel.py
new file mode 100644
index 0000000..5a572d0
--- /dev/null
+++ b/scratch/mso-dumper/src/xlsmodel.py
@@ -0,0 +1,11 @@
+
+import globals, node
+
+class Workbook(object):
+
+    def __init__ (self):
+        pass
+
+    def createDOM (self):
+        nd = node.Element('workbook')
+        return nd
diff --git a/scratch/mso-dumper/src/xlsrecord.py b/scratch/mso-dumper/src/xlsrecord.py
index f55a5a0..2c0e8fc 100644
--- a/scratch/mso-dumper/src/xlsrecord.py
+++ b/scratch/mso-dumper/src/xlsrecord.py
@@ -35,6 +35,12 @@ append a line to be displayed.
 """
         pass
 
+    def fillModel (self, model):
+        """Parse the original bytes and populate the workbook model.
+
+Like parseBytes(), the derived classes must override this method."""
+        pass
+
     def output (self):
         self.parseBytes()
         print("%4.4Xh: %s"%(self.header, "-"*61))
diff --git a/scratch/mso-dumper/src/xlsstream.py b/scratch/mso-dumper/src/xlsstream.py
index 686db95..adca9f3 100644
--- a/scratch/mso-dumper/src/xlsstream.py
+++ b/scratch/mso-dumper/src/xlsstream.py
@@ -399,7 +399,7 @@ class XLDirStream(object):
     def __printSep (self, c='-', w=68, prefix=''):
         print(prefix + c*w)
 
-    def readRecord (self):
+    def __readRecordBytes (self):
         if self.size - self.pos < 4:
             raise EndOfStream
 
@@ -409,6 +409,45 @@ class XLDirStream(object):
             raise EndOfStream
         size = self.readRaw(2)
         bytes = self.readByteArray(size)
+        return pos, header, size, bytes
+
+    def __getRecordHandler (self, header, size, bytes):
+        # record handler that parses the raw bytes and displays more 
+        # meaningful information.
+        handler = None 
+        if recData.has_key(header) and len(recData[header]) >= 3:
+            handler = recData[header][2](header, size, bytes, self.strmData)
+
+        if handler != None and self.strmData.encrypted:
+            # a record handler exists, but the stream is flagged as encrypted; 
+            # discard the handler so the record is not parsed.
+            handler = None
+
+        return handler
+
+    def __postReadRecord (self, header):
+        if recData.has_key(header) and recData[header][0] == "FILEPASS":
+            # presence of FILEPASS record indicates that the stream is 
+            # encrypted.
+            self.strmData.encrypted = True
+
+    def fillModel (self, model):
+        pos, header, size, bytes = self.__readRecordBytes()
+        handler = self.__getRecordHandler(header, size, bytes)
+        if handler != None:
+            handler.fillModel(model)
+        self.__postReadRecord(header)
+
+
+    def readRecordXML (self):
+        pos, header, size, bytes = self.__readRecordBytes()
+        handler = self.__getRecordHandler(header, size, bytes)
+        print (recData[header][1])
+        self.__postReadRecord(header)
+        return header
+
+    def readRecord (self):
+        pos, header, size, bytes = self.__readRecordBytes()
 
         # record handler that parses the raw bytes and displays more 
         # meaningful information.
@@ -449,9 +488,5 @@ class XLDirStream(object):
             # unless the stream is encrypted.
             handler.output()
 
-        if recData.has_key(header) and recData[header][0] == "FILEPASS":
-            # presence of FILEPASS record indicates that the stream is 
-            # encrypted.
-            self.strmData.encrypted = True
-
+        self.__postReadRecord(header)
         return header
diff --git a/scratch/mso-dumper/xls-dump.py b/scratch/mso-dumper/xls-dump.py
index 9ec6e35..56e025b 100755
--- a/scratch/mso-dumper/xls-dump.py
+++ b/scratch/mso-dumper/xls-dump.py
@@ -2,7 +2,7 @@
 
 import sys, os.path, optparse
 sys.path.append(sys.path[0]+"/src")
-import ole, xlsstream, globals
+import ole, xlsstream, globals, node, xlsmodel
 
 from globals import error
 
@@ -31,7 +31,29 @@ class XLDumper(object):
         self.__parseFile()
         dirnames = self.strm.getDirectoryNames()
         for dirname in dirnames:
-            print (dirname)
+            if dirname != "Workbook":
+                # for now, we only dump the Workbook directory stream.
+                continue
+
+            dirstrm = self.strm.getDirectoryStreamByName(dirname)
+            self.__readSubStreamXML(dirstrm)
+
+    def dumpCanonicalXML (self):
+        self.__parseFile()
+        dirnames = self.strm.getDirectoryNames()
+        docroot = node.Root()
+        root = docroot.appendElement('xls-dump')
+
+        for dirname in dirnames:
+            if dirname != "Workbook":
+                # for now, we only dump the Workbook directory stream.
+                continue
+
+            dirstrm = self.strm.getDirectoryStreamByName(dirname)
+            wbmodel = self.__buildWorkbookModel(dirstrm)
+            root.appendChild(wbmodel.createDOM())
+
+        node.prettyPrint(sys.stdout, docroot)
 
     def dump (self):
         self.__parseFile()
@@ -75,17 +97,33 @@ class XLDumper(object):
         except xlsstream.EndOfStream:
             return False
 
+    def __readSubStreamXML (self, strm):
+        try:
+            while True:
+                strm.readRecordXML()
+        except xlsstream.EndOfStream:
+            pass
+
+    def __buildWorkbookModel (self, strm):
+        model = xlsmodel.Workbook()
+        try:
+            while True:
+                strm.fillModel(model)
+        except xlsstream.EndOfStream:
+            pass
+
+        return model
 
 def main ():
     parser = optparse.OptionParser()
     parser.add_option("-d", "--debug", action="store_true", dest="debug", default=False,
-        help="turn on debug mode")
+        help="Turn on debug mode")
     parser.add_option("--show-sector-chain", action="store_true", dest="show_sector_chain", default=False,
-        help="show sector chain information at the start of the output.")
+        help="Show sector chain information at the start of the output.")
     parser.add_option("--show-stream-pos", action="store_true", dest="show_stream_pos", default=False,
-        help="show the position of each record relative to the stream.")
-    parser.add_option("--dump-xml", action="store_true", dest="dump_xml", default=False,
-        help="dump content in XML format.")
+        help="Show the position of each record relative to the stream.")
+    parser.add_option("--dump-mode", dest="dump_mode", default="flat", metavar="MODE",
+        help="Specify the dump mode.  Possible values are: 'flat', 'xml', or 'canonical-xml'.  The default value is 'flat'.")
     options, args = parser.parse_args()
     params = globals.Params()
     params.debug = options.debug
@@ -95,13 +133,19 @@ def main ():
     if len(args) < 1:
         globals.error("takes at least one argument\n")
         parser.print_help()
-        return
+        sys.exit(1)
 
     dumper = XLDumper(args[0], params)
-    if options.dump_xml:
+    if options.dump_mode == 'flat':
+        dumper.dump()
+    elif options.dump_mode == 'xml':
         dumper.dumpXML()
+    elif options.dump_mode == 'canonical-xml':
+        dumper.dumpCanonicalXML()
     else:
-        dumper.dump()
+        error("unknown dump mode: '%s'\n"%options.dump_mode)
+        parser.print_help()
+        sys.exit(1)
 
 if __name__ == '__main__':
     main()