[Libreoffice-commits] mso-dumper.git: 2 commits - Makefile msodumper/docdirstream.py msodumper/vsdstream.py vsd-dump.py

Miklos Vajna vmiklos at collabora.co.uk
Tue Nov 25 10:34:19 PST 2014


 Makefile                  |    1 
 msodumper/docdirstream.py |    2 
 msodumper/vsdstream.py    |  286 ++++++++++++++++++++++++++++++++++++++++++++++
 vsd-dump.py               |   44 +++++++
 4 files changed, 332 insertions(+), 1 deletion(-)

New commits:
commit e13cb64ab62501dbcd0dec98f042f46dd702bf56
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date:   Tue Nov 25 19:27:36 2014 +0100

    Add initial SummaryInformation dumper

diff --git a/Makefile b/Makefile
index 1aa9120..47b42c9 100644
--- a/Makefile
+++ b/Makefile
@@ -2,3 +2,4 @@ check:
 	cd test/doc && ./test.py
 	pep8 --ignore=E501 doc-dump.py msodumper/doc{dirstream,record,sprm,stream}.py test/doc/test.py
 	pep8 --ignore=E501 emf-dump.py msodumper/{emf,wmf}record.py
+	pep8 --ignore=E501 vsd-dump.py msodumper/vsdstream.py
diff --git a/msodumper/vsdstream.py b/msodumper/vsdstream.py
new file mode 100644
index 0000000..36c279a
--- /dev/null
+++ b/msodumper/vsdstream.py
@@ -0,0 +1,286 @@
+#!/usr/bin/env python2
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+import ole
+import ctypes
+import struct
+from docdirstream import DOCDirStream
+import docrecord
+import globals
+import sys
+import os
+import bisect
+
+
+class VSDFile:
+    """Represents the whole visio file - feed with all bytes."""
+    def __init__(self, chars, params):
+        self.chars = chars
+        self.size = len(self.chars)
+        self.params = params
+        self.error = None
+
+        self.init()
+
+    def init(self):
+        self.header = ole.Header(self.chars, self.params)
+        self.pos = self.header.parse()
+
+    def __getDirectoryObj(self):
+        obj = self.header.getDirectory()
+        obj.parseDirEntries()
+        return obj
+
+    def getDirectoryNames(self):
+        return self.__getDirectoryObj().getDirectoryNames()
+
+    def getDirectoryStreamByName(self, name):
+        obj = self.__getDirectoryObj()
+        bytes = obj.getRawStreamByName(name)
+        return self.getStreamFromBytes(name, bytes)
+
+    def getStreamFromBytes(self, name, bytes):
+        if name == "\x05SummaryInformation":
+            return SummaryInformationStream(bytes, self.params, doc=self)
+        else:
+            return DOCDirStream(bytes, self.params, name, doc=self)
+
+    def getName(self):
+        return "native"
+
+
+class GsfVSDFile(VSDFile):
+    """Same as VSDFile, but uses gsf to read the OLE streams."""
+    def __init__(self, chars, params, gsf):
+        self.gsf = gsf
+        VSDFile.__init__(self, chars, params)
+
+    def disableStderr(self):
+        nil = os.open(os.devnull, os.O_WRONLY)
+        self.savedStderr = os.dup(2)
+        os.dup2(nil, 2)
+
+    def enableStderr(self):
+        os.dup2(self.savedStderr, 2)
+
+    def init(self):
+        self.streams = {}
+        self.gsf.gsf_init()
+        gsfInput = self.gsf.gsf_input_memory_new(self.chars, len(self.chars), False)
+        self.disableStderr()
+        gsfInfile = self.gsf.gsf_infile_msole_new(gsfInput, None)
+        self.enableStderr()
+        if not gsfInfile:
+            self.error = "gsf_infile_msole_new() failed"
+            return
+        for i in range(self.gsf.gsf_infile_num_children(gsfInfile)):
+            child = self.gsf.gsf_infile_child_by_index(gsfInfile, i)
+            childName = ctypes.string_at(self.gsf.gsf_infile_name_by_index(gsfInfile, i))
+            childSize = self.gsf.gsf_input_size(child)
+            childData = ""
+            while True:
+                bufSize = 1024
+                pos = self.gsf.gsf_input_tell(child)
+                if pos == childSize:
+                    break
+                elif pos + bufSize > childSize:
+                    bufSize = childSize - pos
+                childData += ctypes.string_at(self.gsf.gsf_input_read(child, bufSize, None), bufSize)
+            self.streams[childName] = childData
+        self.gsf.gsf_shutdown()
+
+    def getDirectoryNames(self):
+        return self.streams.keys()
+
+    def getDirectoryStreamByName(self, name):
+        return self.getStreamFromBytes(name, self.streams[name])
+
+    def getName(self):
+        return "gsf"
+
+
+def createVSDFile(chars, params):
+    hasGsf = True
+    try:
+        gsf = ctypes.cdll.LoadLibrary('libgsf-1.so')
+        gsf.gsf_input_read.restype = ctypes.c_void_p
+    except:
+        hasGsf = False
+
+    if hasGsf:
+        return GsfVSDFile(chars, params, gsf)
+    else:
+        return VSDFile(chars, params)
+
+
+class SummaryInformationStream(DOCDirStream):
+    def __init__(self, bytes, params, doc):
+        DOCDirStream.__init__(self, bytes, params, "\x05SummaryInformation", doc=doc)
+
+    def dump(self):
+        print '<stream name="\\x05SummaryInformation" size="%d">' % self.size
+        PropertySetStream(self).dump()
+        print '</stream>'
+
+
+class PropertySetStream(DOCDirStream):
+    def __init__(self, parent):
+        DOCDirStream.__init__(self, parent.bytes)
+        self.parent = parent
+
+    def dump(self):
+        print '<propertySetStream type="PropertySetStream" offset="%s">' % self.pos
+        self.printAndSet("ByteOrder", self.readuInt16())
+        self.printAndSet("Version", self.readuInt16())
+        self.printAndSet("SystemIdentifier", self.readuInt32())
+        self.printAndSet("CLSID0", self.readuInt32())
+        self.printAndSet("CLSID1", self.readuInt32())
+        self.printAndSet("CLSID2", self.readuInt32())
+        self.printAndSet("CLSID3", self.readuInt32())
+        self.printAndSet("NumPropertySets", self.readuInt32())
+        self.printAndSet("FMTID00", self.readuInt32())
+        self.printAndSet("FMTID01", self.readuInt32())
+        self.printAndSet("FMTID02", self.readuInt32())
+        self.printAndSet("FMTID03", self.readuInt32())
+        self.printAndSet("Offset0", self.readuInt32())
+        if self.NumPropertySets == 0x00000002:
+            print '<todo what="PropertySetStream::dump: handle NumPropertySets == 0x00000002"/>'
+        PropertySet(self).dump()
+        print '</propertySetStream>'
+
+
+class PropertySet(DOCDirStream):
+    def __init__(self, parent):
+        DOCDirStream.__init__(self, parent.bytes)
+        self.parent = parent
+        self.pos = parent.Offset0
+
+    def dump(self):
+        self.posOrig = self.pos
+        print '<propertySet type="PropertySet" offset="%s">' % self.pos
+        self.printAndSet("Size", self.readuInt32())
+        self.printAndSet("NumProperties", self.readuInt32())
+        self.idsAndOffsets = {}
+        for i in range(self.NumProperties):
+            self.idsAndOffsets[i] = PropertyIdentifierAndOffset(self, i)
+            self.idsAndOffsets[i].dump()
+        for i in range(self.NumProperties):
+            TypedPropertyValue(self, i).dump()
+        print '</propertySet>'
+
+PropertyIdentifier = {
+    0x00000001: "CODEPAGE_PROPERTY_IDENTIFIER",
+    0x00000002: "PIDSI_TITLE",
+    0x00000003: "PIDSI_SUBJECT",
+    0x00000004: "PIDSI_AUTHOR",
+    0x00000005: "PIDSI_KEYWORDS",
+    0x00000006: "PIDSI_COMMENTS",
+    0x00000007: "PIDSI_TEMPLATE",
+    0x00000008: "PIDSI_LASTAUTHOR",
+    0x00000009: "PIDSI_REVNUMBER",
+    0x0000000A: "PIDSI_EDITTIME",
+    0x0000000B: "PIDSI_LASTPRINTED",
+    0x0000000C: "PIDSI_CREATE_DTM",
+    0x0000000D: "PIDSI_LASTSAVE_DTM",
+    0x0000000E: "PIDSI_PAGECOUNT",
+    0x0000000F: "PIDSI_WORDCOUNT",
+    0x00000010: "PIDSI_CHARCOUNT",
+    0x00000011: "PIDSI_THUMBNAIL",
+    0x00000012: "PIDSI_APPNAME",
+    0x00000013: "PIDSI_DOC_SECURITY",
+}
+
+
+class PropertyIdentifierAndOffset(DOCDirStream):
+    def __init__(self, parent, index):
+        DOCDirStream.__init__(self, parent.bytes)
+        self.parent = parent
+        self.index = index
+        self.pos = parent.pos
+
+    def dump(self):
+        print '<propertyIdentifierAndOffset%s type="PropertyIdentifierAndOffset" offset="%s">' % (self.index, self.pos)
+        self.printAndSet("PropertyIdentifier", self.readuInt32(), dict=PropertyIdentifier)
+        self.printAndSet("Offset", self.readuInt32())
+        print '</propertyIdentifierAndOffset%s>' % self.index
+        self.parent.pos = self.pos
+
+PropertyType = {
+    0x0000: "VT_EMPTY",
+    0x0001: "VT_NULL",
+    0x0002: "VT_I2",
+    0x0003: "VT_I4",
+    0x0004: "VT_R4",
+    0x0005: "VT_R8",
+    0x0006: "VT_CY",
+    0x0007: "VT_DATE",
+    0x0008: "VT_BSTR",
+    0x000A: "VT_ERROR",
+    0x000B: "VT_BOOL",
+    0x000E: "VT_DECIMAL",
+    0x0010: "VT_I1",
+    0x0011: "VT_UI1",
+    0x0012: "VT_UI2",
+    0x0013: "VT_UI4",
+    0x0014: "VT_I8",
+    0x0015: "VT_UI8",
+    0x0016: "VT_INT",
+    0x0017: "VT_UINT",
+    0x001E: "VT_LPSTR",
+    0x001F: "VT_LPWSTR",
+    0x0040: "VT_FILETIME",
+    0x0041: "VT_BLOB",
+    0x0042: "VT_STREAM",
+    0x0043: "VT_STORAGE",
+    0x0044: "VT_STREAMED_Object",
+    0x0045: "VT_STORED_Object",
+    0x0046: "VT_BLOB_Object",
+    0x0047: "VT_CF",
+    0x0048: "VT_CLSID",
+    0x0049: "VT_VERSIONED_STREAM",
+}
+
+
+class TypedPropertyValue(DOCDirStream):
+    def __init__(self, parent, index):
+        DOCDirStream.__init__(self, parent.bytes)
+        self.parent = parent
+        self.index = index
+        self.pos = parent.posOrig + parent.idsAndOffsets[index].Offset
+
+    def dump(self):
+        print '<typedPropertyValue%s type="TypedPropertyValue" offset="%s">' % (self.index, self.pos)
+        self.printAndSet("Type", self.readuInt16(), dict=PropertyType)
+        self.printAndSet("Padding", self.readuInt16())
+        if self.Type == 0x0002:  # VT_I2
+            self.printAndSet("Value", self.readInt16())
+        elif self.Type == 0x001E:  # VT_LPSTR
+            CodePageString(self, "Value").dump()
+        else:
+            print '<todo what="TypedPropertyValue::dump: unhandled Type %s"/>' % hex(self.Type)
+        print '</typedPropertyValue%s>' % self.index
+
+
+class CodePageString(DOCDirStream):
+    def __init__(self, parent, name):
+        DOCDirStream.__init__(self, parent.bytes)
+        self.pos = parent.pos
+        self.name = name
+
+    def dump(self):
+        print '<%s type="CodePageString">' % self.name
+        self.printAndSet("Size", self.readuInt32())
+        bytes = []
+        for i in range(self.Size):
+            c = self.readuInt8()
+            if c == 0:
+                break
+            bytes.append(c)
+        print '<Characters value="%s"/>' % "".join(map(lambda c: chr(c), bytes))
+        print '</%s>' % self.name
+
+# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
diff --git a/vsd-dump.py b/vsd-dump.py
new file mode 100755
index 0000000..9d56f8f
--- /dev/null
+++ b/vsd-dump.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python2
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+import sys
+sys = reload(sys)
+sys.setdefaultencoding("utf-8")
+
+from msodumper import globals, vsdstream
+
+
+class VSDDumper:
+    def __init__(self, filepath, params):
+        self.filepath = filepath
+        self.params = params
+
+    def dump(self):
+        file = open(self.filepath, 'rb')
+        strm = vsdstream.createVSDFile(file.read(), self.params)
+        file.close()
+        dirnames = strm.getDirectoryNames()
+        print '<?xml version="1.0"?>\n<streams ole-type="%s">' % strm.getName()
+        if strm.error:
+            print '<error what="%s"/>' % strm.error
+        for dirname in dirnames:
+            if len(dirname) == 0 or dirname in ['Root Entry']:
+                continue
+            strm.getDirectoryStreamByName(dirname).dump()
+        print '</streams>'
+
+
+def main(args):
+    exname, args = args[0], args[1:]
+    params = globals.Params()
+    dumper = VSDDumper(args[0], params)
+    dumper.dump()
+
+if __name__ == '__main__':
+    main(sys.argv)
+
+# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
commit 50a4c3e2478bf80a544e00164d0dfdda687587b8
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date:   Tue Nov 25 19:26:16 2014 +0100

    test for object identity should be 'is not'

diff --git a/msodumper/docdirstream.py b/msodumper/docdirstream.py
index c9aa3d5..88a91d6 100644
--- a/msodumper/docdirstream.py
+++ b/msodumper/docdirstream.py
@@ -138,7 +138,7 @@ class DOCDirStream:
         count = 0
         pos = self.pos
         while True:
-            if (not limit is None) and count == limit:
+            if (limit is not None) and count == limit:
                 break
             i = self.getuInt8(pos=pos)
             pos += 1


More information about the Libreoffice-commits mailing list