[Libreoffice-commits] mso-dumper.git: 7 commits - msodumper/vsdstream.py

Miklos Vajna vmiklos at collabora.co.uk
Wed Nov 26 04:11:35 PST 2014


 msodumper/vsdstream.py |  202 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 164 insertions(+), 38 deletions(-)

New commits:
commit 18e52f3e1c61d1d644b427cd354d694f6a457f34
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date:   Wed Nov 26 12:46:20 2014 +0100

    VSD: start dumping the user-defined set

diff --git a/msodumper/vsdstream.py b/msodumper/vsdstream.py
index 8dcd1bf..d2403a3 100644
--- a/msodumper/vsdstream.py
+++ b/msodumper/vsdstream.py
@@ -209,19 +209,22 @@ class PropertySetStream(DOCDirStream):
         self.printAndSet("CLSID2", self.readuInt32())
         self.printAndSet("CLSID3", self.readuInt32())
         self.printAndSet("NumPropertySets", self.readuInt32())
-        GUID(self, "FMTID").dump()
+        GUID(self, "FMTID0").dump()
         self.printAndSet("Offset0", self.readuInt32())
+        PropertySet(self, self.Offset0).dump()
         if self.NumPropertySets == 0x00000002:
-            print '<todo what="PropertySetStream::dump: handle NumPropertySets == 0x00000002"/>'
-        PropertySet(self).dump()
+            GUID(self, "FMTID1").dump()
+            self.printAndSet("Offset1", self.readuInt32())
+            self.propertyIds = {}
+            PropertySet(self, self.Offset1).dump()
         print '</propertySetStream>'
 
 
 class PropertySet(DOCDirStream):
-    def __init__(self, parent):
+    def __init__(self, parent, offset):
         DOCDirStream.__init__(self, parent.bytes)
         self.parent = parent
-        self.pos = parent.Offset0
+        self.pos = offset
 
     def getCodePage(self):
         for index, idAndOffset in enumerate(self.idsAndOffsets):
commit c2cd85eddf02c74e0f26955e6e847ebc61d79593
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date:   Wed Nov 26 12:36:52 2014 +0100

    VSD: dump PIDDSI

diff --git a/msodumper/vsdstream.py b/msodumper/vsdstream.py
index 380c7ef..8dcd1bf 100644
--- a/msodumper/vsdstream.py
+++ b/msodumper/vsdstream.py
@@ -119,7 +119,34 @@ def createVSDFile(chars, params):
         return VSDFile(chars, params)
 
 
-PropertyIdentifierDocumentSummaryInformation = {
+PIDDSI = {
+    0x00000001: "PIDDSI_CODEPAGE",
+    0x00000002: "PIDDSI_CATEGORY",
+    0x00000003: "PIDDSI_PRESFORMAT",
+    0x00000004: "PIDDSI_BYTECOUNT",
+    0x00000005: "PIDDSI_LINECOUNT",
+    0x00000006: "PIDDSI_PARACOUNT",
+    0x00000007: "PIDDSI_SLIDECOUNT",
+    0x00000008: "PIDDSI_NOTECOUNT",
+    0x00000009: "PIDDSI_HIDDENCOUNT",
+    0x0000000A: "PIDDSI_MMCLIPCOUNT",
+    0x0000000B: "PIDDSI_SCALE",
+    0x0000000C: "PIDDSI_HEADINGPAIR",
+    0x0000000D: "PIDDSI_DOCPARTS",
+    0x0000000E: "PIDDSI_MANAGER",
+    0x0000000F: "PIDDSI_COMPANY",
+    0x00000010: "PIDDSI_LINKSDIRTY",
+    0x00000011: "PIDDSI_CCHWITHSPACES",
+    0x00000013: "PIDDSI_SHAREDDOC",
+    0x00000014: "PIDDSI_LINKBASE",
+    0x00000015: "PIDDSI_HLINKS",
+    0x00000016: "PIDDSI_HYPERLINKSCHANGED",
+    0x00000017: "PIDDSI_VERSION",
+    0x00000018: "PIDDSI_DIGSIG",
+    0x0000001A: "PIDDSI_CONTENTTYPE",
+    0x0000001B: "PIDDSI_CONTENTSTATUS",
+    0x0000001C: "PIDDSI_LANGUAGE",
+    0x0000001D: "PIDDSI_DOCVERSION",
 }
 
 
@@ -129,11 +156,11 @@ class DocumentSummaryInformationStream(DOCDirStream):
 
     def dump(self):
         print '<stream name="\\x05DocumentSummaryInformation" size="%d">' % self.size
-        PropertySetStream(self, PropertyIdentifierDocumentSummaryInformation).dump()
+        PropertySetStream(self, PIDDSI).dump()
         print '</stream>'
 
 
-PropertyIdentifierSummaryInformation = {
+PIDSI = {
     0x00000001: "CODEPAGE_PROPERTY_IDENTIFIER",
     0x00000002: "PIDSI_TITLE",
     0x00000003: "PIDSI_SUBJECT",
@@ -162,7 +189,7 @@ class SummaryInformationStream(DOCDirStream):
 
     def dump(self):
         print '<stream name="\\x05SummaryInformation" size="%d">' % self.size
-        PropertySetStream(self, PropertyIdentifierSummaryInformation).dump()
+        PropertySetStream(self, PIDSI).dump()
         print '</stream>'
 
 
commit 7432d469dd0333bdb7156484c6fb32fb03e1d173
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date:   Wed Nov 26 11:17:04 2014 +0100

    VSD: dump GUIDs properly

diff --git a/msodumper/vsdstream.py b/msodumper/vsdstream.py
index 481f086..380c7ef 100644
--- a/msodumper/vsdstream.py
+++ b/msodumper/vsdstream.py
@@ -182,10 +182,7 @@ class PropertySetStream(DOCDirStream):
         self.printAndSet("CLSID2", self.readuInt32())
         self.printAndSet("CLSID3", self.readuInt32())
         self.printAndSet("NumPropertySets", self.readuInt32())
-        self.printAndSet("FMTID00", self.readuInt32())
-        self.printAndSet("FMTID01", self.readuInt32())
-        self.printAndSet("FMTID02", self.readuInt32())
-        self.printAndSet("FMTID03", self.readuInt32())
+        GUID(self, "FMTID").dump()
         self.printAndSet("Offset0", self.readuInt32())
         if self.NumPropertySets == 0x00000002:
             print '<todo what="PropertySetStream::dump: handle NumPropertySets == 0x00000002"/>'
@@ -363,4 +360,23 @@ class CodePageString(DOCDirStream):
             print '<todo what="CodePageString::dump: unhandled codepage %s"/>' % codepage
         print '</%s>' % self.name
 
+
+class GUID(DOCDirStream):
+    def __init__(self, parent, name):
+        DOCDirStream.__init__(self, parent.bytes)
+        self.pos = parent.pos
+        self.parent = parent
+        self.name = name
+
+    def dump(self):
+        Data1 = self.readuInt32()
+        Data2 = self.readuInt16()
+        Data3 = self.readuInt16()
+        Data4 = []
+        for i in range(8):
+            Data4.append(self.readuInt8())
+        value = "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x" % (Data1, Data2, Data3, Data4[0], Data4[1], Data4[2], Data4[3], Data4[4], Data4[5], Data4[6], Data4[7])
+        print '<%s type="GUID" value="%s"/>' % (self.name, value)
+        self.parent.pos = self.pos
+
 # vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
commit c4c94fa5d494c105f17d41e3d5e1e3973cf44e8c
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date:   Wed Nov 26 10:37:52 2014 +0100

    vsd: complete PropertyType enumeration

diff --git a/msodumper/vsdstream.py b/msodumper/vsdstream.py
index 5132c87..481f086 100644
--- a/msodumper/vsdstream.py
+++ b/msodumper/vsdstream.py
@@ -119,12 +119,17 @@ def createVSDFile(chars, params):
         return VSDFile(chars, params)
 
 
+PropertyIdentifierDocumentSummaryInformation = {
+}
+
+
 class DocumentSummaryInformationStream(DOCDirStream):
     def __init__(self, bytes, params, doc):
         DOCDirStream.__init__(self, bytes, params, "\x05DocumentSummaryInformation", doc=doc)
 
     def dump(self):
         print '<stream name="\\x05DocumentSummaryInformation" size="%d">' % self.size
+        PropertySetStream(self, PropertyIdentifierDocumentSummaryInformation).dump()
         print '</stream>'
 
 
@@ -264,6 +269,45 @@ PropertyType = {
     0x0047: "VT_CF",
     0x0048: "VT_CLSID",
     0x0049: "VT_VERSIONED_STREAM",
+    0x1002: "VT_VECTOR | VT_I2",
+    0x1003: "VT_VECTOR | VT_I4",
+    0x1004: "VT_VECTOR | VT_R4",
+    0x1005: "VT_VECTOR | VT_R8",
+    0x1006: "VT_VECTOR | VT_CY",
+    0x1007: "VT_VECTOR | VT_DATE",
+    0x1008: "VT_VECTOR | VT_BSTR",
+    0x100A: "VT_VECTOR | VT_ERROR",
+    0x100B: "VT_VECTOR | VT_BOOL",
+    0x100C: "VT_VECTOR | VT_VARIANT",
+    0x1010: "VT_VECTOR | VT_I1",
+    0x1011: "VT_VECTOR | VT_UI1",
+    0x1012: "VT_VECTOR | VT_UI2",
+    0x1013: "VT_VECTOR | VT_UI4",
+    0x1014: "VT_VECTOR | VT_I8",
+    0x1015: "VT_VECTOR | VT_UI8",
+    0x101E: "VT_VECTOR | VT_LPSTR",
+    0x101F: "VT_VECTOR | VT_LPWSTR",
+    0x1040: "VT_VECTOR | VT_FILETIME",
+    0x1047: "VT_VECTOR | VT_CF",
+    0x1048: "VT_VECTOR | VT_CLSID",
+    0x2002: "VT_ARRAY | VT_I2",
+    0x2003: "VT_ARRAY | VT_I4",
+    0x2004: "VT_ARRAY | VT_R4",
+    0x2005: "VT_ARRAY | VT_R8",
+    0x2006: "VT_ARRAY | VT_CY",
+    0x2007: "VT_ARRAY | VT_DATE",
+    0x2008: "VT_ARRAY | VT_BSTR",
+    0x200A: "VT_ARRAY | VT_ERROR",
+    0x200B: "VT_ARRAY | VT_BOOL",
+    0x200C: "VT_ARRAY | VT_VARIANT",
+    0x200E: "VT_ARRAY | VT_DECIMAL",
+    0x2010: "VT_ARRAY | VT_I1",
+    0x2011: "VT_ARRAY | VT_UI1",
+    0x2012: "VT_ARRAY | VT_UI2",
+    0x2013: "VT_ARRAY | VT_UI4",
+    0x2016: "VT_ARRAY | VT_INT",
+    0x2017: "VT_ARRAY | VT_UINT",
+
 }
 
 
commit afbd91f0a3d180def770c4a070ddbd57cc6849c8
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date:   Wed Nov 26 10:25:58 2014 +0100

    vsd: PIDSI is specific to the SummaryInformation stream

diff --git a/msodumper/vsdstream.py b/msodumper/vsdstream.py
index f91380c..5132c87 100644
--- a/msodumper/vsdstream.py
+++ b/msodumper/vsdstream.py
@@ -46,6 +46,8 @@ class VSDFile:
     def getStreamFromBytes(self, name, bytes):
         if name == "\x05SummaryInformation":
             return SummaryInformationStream(bytes, self.params, doc=self)
+        elif name == "\x05DocumentSummaryInformation":
+            return DocumentSummaryInformationStream(bytes, self.params, doc=self)
         else:
             return DOCDirStream(bytes, self.params, name, doc=self)
 
@@ -117,20 +119,53 @@ def createVSDFile(chars, params):
         return VSDFile(chars, params)
 
 
+class DocumentSummaryInformationStream(DOCDirStream):
+    def __init__(self, bytes, params, doc):
+        DOCDirStream.__init__(self, bytes, params, "\x05DocumentSummaryInformation", doc=doc)
+
+    def dump(self):
+        print '<stream name="\\x05DocumentSummaryInformation" size="%d">' % self.size
+        print '</stream>'
+
+
+PropertyIdentifierSummaryInformation = {
+    0x00000001: "CODEPAGE_PROPERTY_IDENTIFIER",
+    0x00000002: "PIDSI_TITLE",
+    0x00000003: "PIDSI_SUBJECT",
+    0x00000004: "PIDSI_AUTHOR",
+    0x00000005: "PIDSI_KEYWORDS",
+    0x00000006: "PIDSI_COMMENTS",
+    0x00000007: "PIDSI_TEMPLATE",
+    0x00000008: "PIDSI_LASTAUTHOR",
+    0x00000009: "PIDSI_REVNUMBER",
+    0x0000000A: "PIDSI_EDITTIME",
+    0x0000000B: "PIDSI_LASTPRINTED",
+    0x0000000C: "PIDSI_CREATE_DTM",
+    0x0000000D: "PIDSI_LASTSAVE_DTM",
+    0x0000000E: "PIDSI_PAGECOUNT",
+    0x0000000F: "PIDSI_WORDCOUNT",
+    0x00000010: "PIDSI_CHARCOUNT",
+    0x00000011: "PIDSI_THUMBNAIL",
+    0x00000012: "PIDSI_APPNAME",
+    0x00000013: "PIDSI_DOC_SECURITY",
+}
+
+
 class SummaryInformationStream(DOCDirStream):
     def __init__(self, bytes, params, doc):
         DOCDirStream.__init__(self, bytes, params, "\x05SummaryInformation", doc=doc)
 
     def dump(self):
         print '<stream name="\\x05SummaryInformation" size="%d">' % self.size
-        PropertySetStream(self).dump()
+        PropertySetStream(self, PropertyIdentifierSummaryInformation).dump()
         print '</stream>'
 
 
 class PropertySetStream(DOCDirStream):
-    def __init__(self, parent):
+    def __init__(self, parent, PropertyIds):
         DOCDirStream.__init__(self, parent.bytes)
         self.parent = parent
+        self.propertyIds = PropertyIds
 
     def dump(self):
         print '<propertySetStream type="PropertySetStream" offset="%s">' % self.pos
@@ -181,28 +216,6 @@ class PropertySet(DOCDirStream):
             self.typedPropertyValues.append(typedPropertyValue)
         print '</propertySet>'
 
-PropertyIdentifier = {
-    0x00000001: "CODEPAGE_PROPERTY_IDENTIFIER",
-    0x00000002: "PIDSI_TITLE",
-    0x00000003: "PIDSI_SUBJECT",
-    0x00000004: "PIDSI_AUTHOR",
-    0x00000005: "PIDSI_KEYWORDS",
-    0x00000006: "PIDSI_COMMENTS",
-    0x00000007: "PIDSI_TEMPLATE",
-    0x00000008: "PIDSI_LASTAUTHOR",
-    0x00000009: "PIDSI_REVNUMBER",
-    0x0000000A: "PIDSI_EDITTIME",
-    0x0000000B: "PIDSI_LASTPRINTED",
-    0x0000000C: "PIDSI_CREATE_DTM",
-    0x0000000D: "PIDSI_LASTSAVE_DTM",
-    0x0000000E: "PIDSI_PAGECOUNT",
-    0x0000000F: "PIDSI_WORDCOUNT",
-    0x00000010: "PIDSI_CHARCOUNT",
-    0x00000011: "PIDSI_THUMBNAIL",
-    0x00000012: "PIDSI_APPNAME",
-    0x00000013: "PIDSI_DOC_SECURITY",
-}
-
 
 class PropertyIdentifierAndOffset(DOCDirStream):
     def __init__(self, parent, index):
@@ -213,7 +226,7 @@ class PropertyIdentifierAndOffset(DOCDirStream):
 
     def dump(self):
         print '<propertyIdentifierAndOffset%s type="PropertyIdentifierAndOffset" offset="%s">' % (self.index, self.pos)
-        self.printAndSet("PropertyIdentifier", self.readuInt32(), dict=PropertyIdentifier)
+        self.printAndSet("PropertyIdentifier", self.readuInt32(), dict=self.parent.parent.propertyIds)
         self.printAndSet("Offset", self.readuInt32())
         print '</propertyIdentifierAndOffset%s>' % self.index
         self.parent.pos = self.pos
commit 84940f4150fa56256ddff631fe6dc671e10f5b93
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date:   Wed Nov 26 09:59:03 2014 +0100

    vsdstream: dump utf8 titles

diff --git a/msodumper/vsdstream.py b/msodumper/vsdstream.py
index 4e99aa5..f91380c 100644
--- a/msodumper/vsdstream.py
+++ b/msodumper/vsdstream.py
@@ -290,12 +290,20 @@ class CodePageString(DOCDirStream):
             if c == 0:
                 break
             bytes.append(c)
+        codepage = self.parent.parent.getCodePage()
+        if codepage < 0:
+            codepage += 2 ** 16  # signed -> unsigned
         encoding = ""
-        if self.parent.parent.getCodePage() == 1252:
+        if codepage == 1252:
             # http://msdn.microsoft.com/en-us/goglobal/bb964654
             encoding = "latin1"
+        elif codepage == 65001:
+            # http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130%28v=vs.85%29.aspx
+            encoding = "utf-8"
         if len(encoding):
             print '<Characters value="%s"/>' % "".join(map(lambda c: chr(c), bytes)).decode(encoding).encode('utf-8')
+        else:
+            print '<todo what="CodePageString::dump: unhandled codepage %s"/>' % codepage
         print '</%s>' % self.name
 
 # vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
commit ae210089c3a5d6dd0932564c6f600be88739db45
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date:   Wed Nov 26 09:41:38 2014 +0100

    vsdstream: dump latin1 titles

diff --git a/msodumper/vsdstream.py b/msodumper/vsdstream.py
index 36c279a..4e99aa5 100644
--- a/msodumper/vsdstream.py
+++ b/msodumper/vsdstream.py
@@ -159,17 +159,26 @@ class PropertySet(DOCDirStream):
         self.parent = parent
         self.pos = parent.Offset0
 
+    def getCodePage(self):
+        for index, idAndOffset in enumerate(self.idsAndOffsets):
+            if idAndOffset.PropertyIdentifier == 0x00000001:  # CODEPAGE_PROPERTY_IDENTIFIER
+                return self.typedPropertyValues[index].Value
+
     def dump(self):
         self.posOrig = self.pos
         print '<propertySet type="PropertySet" offset="%s">' % self.pos
         self.printAndSet("Size", self.readuInt32())
         self.printAndSet("NumProperties", self.readuInt32())
-        self.idsAndOffsets = {}
+        self.idsAndOffsets = []
         for i in range(self.NumProperties):
-            self.idsAndOffsets[i] = PropertyIdentifierAndOffset(self, i)
-            self.idsAndOffsets[i].dump()
+            idAndOffset = PropertyIdentifierAndOffset(self, i)
+            idAndOffset.dump()
+            self.idsAndOffsets.append(idAndOffset)
+        self.typedPropertyValues = []
         for i in range(self.NumProperties):
-            TypedPropertyValue(self, i).dump()
+            typedPropertyValue = TypedPropertyValue(self, i)
+            typedPropertyValue.dump()
+            self.typedPropertyValues.append(typedPropertyValue)
         print '</propertySet>'
 
 PropertyIdentifier = {
@@ -269,6 +278,7 @@ class CodePageString(DOCDirStream):
     def __init__(self, parent, name):
         DOCDirStream.__init__(self, parent.bytes)
         self.pos = parent.pos
+        self.parent = parent
         self.name = name
 
     def dump(self):
@@ -280,7 +290,12 @@ class CodePageString(DOCDirStream):
             if c == 0:
                 break
             bytes.append(c)
-        print '<Characters value="%s"/>' % "".join(map(lambda c: chr(c), bytes))
+        encoding = ""
+        if self.parent.parent.getCodePage() == 1252:
+            # http://msdn.microsoft.com/en-us/goglobal/bb964654
+            encoding = "latin1"
+        if len(encoding):
+            print '<Characters value="%s"/>' % "".join(map(lambda c: chr(c), bytes)).decode(encoding).encode('utf-8')
         print '</%s>' % self.name
 
 # vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:


More information about the Libreoffice-commits mailing list