[Libreoffice-commits] mso-dumper.git: 10 commits - doc-dump.py emf-dump.py Makefile msodumper/docdirstream.py msodumper/docrecord.py msodumper/docstream.py msodumper/msometa.py msodumper/vsdstream.py test/doc
Miklos Vajna
vmiklos at collabora.co.uk
Fri Apr 17 06:31:49 PDT 2015
Makefile | 1
doc-dump.py | 3
emf-dump.py | 3
msodumper/docdirstream.py | 3
msodumper/docrecord.py | 7 -
msodumper/docstream.py | 8 +
msodumper/msometa.py | 314 ++++++++++++++++++++++++++++++++++++++++++++++
msodumper/vsdstream.py | 293 ------------------------------------------
test/doc/test.py | 6
9 files changed, 337 insertions(+), 301 deletions(-)
New commits:
commit 34d3f8c16849f5a4c91d2a350f81a8c78f1539e7
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date: Fri Apr 17 15:31:25 2015 +0200
DefTableShd80Operand.dump: Unused variable 'i'
diff --git a/msodumper/docrecord.py b/msodumper/docrecord.py
index 1586c48..d904d34 100644
--- a/msodumper/docrecord.py
+++ b/msodumper/docrecord.py
@@ -624,13 +624,14 @@ class Shd80(DOCDirStream):
"""The Shd80 structure specifies the colors and pattern that are used for background shading."""
size = 2 # in bytes, see 2.9.245
- def __init__(self, parent):
+ def __init__(self, parent, index):
DOCDirStream.__init__(self, parent.bytes)
self.pos = parent.pos
self.parent = parent
+ self.index = index
def dump(self):
- print '<shd80 type="Shd80" offset="%d">' % self.pos
+ print '<shd80 type="Shd80" offset="%d" index="%d">' % (self.pos, self.index)
buf = self.readuInt16()
self.printAndSet("icoFore", buf & 0x001f, dict=Ico) # 1..5th bits
self.printAndSet("icoBack", (buf & 0x03e0) >> 5, dict=Ico) # 6..10th bits
@@ -650,7 +651,7 @@ class DefTableShd80Operand(DOCDirStream):
print '<defTableShd80Operand type="DefTableShd80Operand" offset="%d">' % self.pos
self.printAndSet("cb", self.readuInt8())
for i in xrange(self.cb / Shd80.size):
- Shd80(self).dump()
+ Shd80(self, i).dump()
print '</defTableShd80Operand>'
commit 06ab6a09468b8400bc1622f055505bb640c620b6
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date: Fri Apr 17 15:29:15 2015 +0200
DOCDirStream.getString: Unused variable 'pos'
diff --git a/msodumper/docdirstream.py b/msodumper/docdirstream.py
index 88a91d6..7b9e8de 100644
--- a/msodumper/docdirstream.py
+++ b/msodumper/docdirstream.py
@@ -153,8 +153,7 @@ class DOCDirStream:
return (self.quoteAttr(globals.encodeName(globals.getUTF8FromUTF16("".join(map(lambda x: chr(x), bytes))))), pos)
def getString(self, limit=None):
- ret, pos = self.__getString(limit)
- return ret
+ return self.__getString(limit)[0]
def readString(self, limit=None):
ret, pos = self.__getString(limit)
commit f7031b02e6d97fd926d847397050c5736d9013e1
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date: Fri Apr 17 15:28:45 2015 +0200
emf-dump: Unused variable 'exname'
diff --git a/emf-dump.py b/emf-dump.py
index 1023a9f..1e8f3d3 100755
--- a/emf-dump.py
+++ b/emf-dump.py
@@ -25,8 +25,7 @@ class EMFDumper:
def main(args):
- exname, args = args[0], args[1:]
- dumper = EMFDumper(args[0])
+ dumper = EMFDumper(args[1])
dumper.dump()
if __name__ == '__main__':
commit f6a9a2b35c10b967efda2d792686dc10899268e7
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date: Fri Apr 17 15:28:06 2015 +0200
doc-dump: Unused variable 'exname'
diff --git a/doc-dump.py b/doc-dump.py
index ab57719..a030fdd 100755
--- a/doc-dump.py
+++ b/doc-dump.py
@@ -33,9 +33,8 @@ class DOCDumper:
def main(args):
- exname, args = args[0], args[1:]
params = globals.Params()
- dumper = DOCDumper(args[0], params)
+ dumper = DOCDumper(args[1], params)
dumper.dump()
if __name__ == '__main__':
commit f1f9ecccc164a48fc29244d45269ef913c01b12f
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date: Fri Apr 17 15:11:23 2015 +0200
docstream: hook into msometa
diff --git a/msodumper/docstream.py b/msodumper/docstream.py
index a30859f..b455cea 100644
--- a/msodumper/docstream.py
+++ b/msodumper/docstream.py
@@ -14,6 +14,8 @@ import globals
import sys
import os
import bisect
+from msometa import SummaryInformationStream
+from msometa import DocumentSummaryInformationStream
class DOCFile:
@@ -54,8 +56,12 @@ class DOCFile:
def getStreamFromBytes(self, name, bytes):
if name == "WordDocument":
return WordDocumentStream(bytes, self.params, doc=self)
- if name in ("0Table", "1Table"):
+ elif name in ("0Table", "1Table"):
return TableStream(bytes, self.params, name, doc=self)
+ elif name == "\x05SummaryInformation":
+ return SummaryInformationStream(bytes, self.params, doc=self)
+ elif name == "\x05DocumentSummaryInformation":
+ return DocumentSummaryInformationStream(bytes, self.params, doc=self)
else:
return DOCDirStream(bytes, self.params, name, doc=self)
diff --git a/test/doc/test.py b/test/doc/test.py
index c63bc11..55d9819 100755
--- a/test/doc/test.py
+++ b/test/doc/test.py
@@ -157,6 +157,12 @@ class Test(unittest.TestCase):
# Zoom is 42%
self.assertEqual('0x2a', dopBase.findall('pctWwdSaved')[0].attrib['value'])
+ # Assert metadata: who is the author.
+ propertyIdentifier = self.root.findall('stream[@name="\\x05SummaryInformation"]/propertySetStream/propertySet/propertyIdentifierAndOffset3/PropertyIdentifier')[0]
+ self.assertEqual('PIDSI_AUTHOR', propertyIdentifier.attrib["name"])
+ typedPropertyValue = self.root.findall('stream[@name="\\x05SummaryInformation"]/propertySetStream/propertySet/typedPropertyValue3/Value/Characters')[0]
+ self.assertEqual('vmiklos', typedPropertyValue.attrib["value"])
+
def test_nofibnew(self):
self.dump('nofibnew')
commit 4cf732e24a768f161abded0a1caf5fd73185e293
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date: Fri Apr 17 15:10:57 2015 +0200
msometa: handle latin2 encoding
diff --git a/msodumper/msometa.py b/msodumper/msometa.py
index 5ee0228..2afd8fa 100644
--- a/msodumper/msometa.py
+++ b/msodumper/msometa.py
@@ -275,12 +275,14 @@ class CodePageString(DOCDirStream):
break
bytes.append(c)
codepage = self.parent.parent.getCodePage()
- if codepage < 0:
+ if (codepage is not None) and (codepage < 0):
codepage += 2 ** 16 # signed -> unsigned
encoding = ""
+ # http://msdn.microsoft.com/en-us/goglobal/bb964654
if codepage == 1252:
- # http://msdn.microsoft.com/en-us/goglobal/bb964654
encoding = "latin1"
+ elif codepage == 1250:
+ encoding = "latin2"
elif codepage == 65001:
# http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130%28v=vs.85%29.aspx
encoding = "utf-8"
commit 118fb2945bda2c626cc1391fc4e1eb87d872d24b
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date: Fri Apr 17 15:09:54 2015 +0200
msometa: don't crash on invalid offset
As seen in gnome169822-1.doc.
diff --git a/msodumper/msometa.py b/msodumper/msometa.py
index b611b0f..5ee0228 100644
--- a/msodumper/msometa.py
+++ b/msodumper/msometa.py
@@ -129,6 +129,9 @@ class PropertySet(DOCDirStream):
return self.typedPropertyValues[index].Value
def dump(self):
+ if self.pos > self.size:
+ return
+
self.posOrig = self.pos
print '<propertySet type="PropertySet" offset="%s">' % self.pos
self.printAndSet("Size", self.readuInt32())
commit 235dddc3178b583f8b68f327949d0be98c097afc
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date: Fri Apr 17 15:05:52 2015 +0200
msometa: encode low characters to avoid not-well-formed output
Again in the dump of abi2017-1.doc.
diff --git a/msodumper/msometa.py b/msodumper/msometa.py
index 5dc2792..b611b0f 100644
--- a/msodumper/msometa.py
+++ b/msodumper/msometa.py
@@ -282,7 +282,7 @@ class CodePageString(DOCDirStream):
# http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130%28v=vs.85%29.aspx
encoding = "utf-8"
if len(encoding):
- print '<Characters value="%s"/>' % "".join(map(lambda c: chr(c), bytes)).decode(encoding).encode('utf-8')
+ print '<Characters value="%s"/>' % globals.encodeName("".join(map(lambda c: chr(c), bytes)).decode(encoding), lowOnly=True).encode('utf-8')
else:
print '<todo what="CodePageString::dump: unhandled codepage %s"/>' % codepage
print '</%s>' % self.name
commit a92a7b612db026f9103b2f232abf9b2837b3248f
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date: Fri Apr 17 14:59:46 2015 +0200
msometa: don't fail when the ID is not a well-known one
Fixes dump of abi2017-1.doc.
diff --git a/msodumper/msometa.py b/msodumper/msometa.py
index 8b87284..5dc2792 100644
--- a/msodumper/msometa.py
+++ b/msodumper/msometa.py
@@ -155,7 +155,7 @@ class PropertyIdentifierAndOffset(DOCDirStream):
def dump(self):
print '<propertyIdentifierAndOffset%s type="PropertyIdentifierAndOffset" offset="%s">' % (self.index, self.pos)
- self.printAndSet("PropertyIdentifier", self.readuInt32(), dict=self.parent.parent.propertyIds)
+ self.printAndSet("PropertyIdentifier", self.readuInt32(), dict=self.parent.parent.propertyIds, default="unknown")
self.printAndSet("Offset", self.readuInt32())
print '</propertyIdentifierAndOffset%s>' % self.index
self.parent.pos = self.pos
commit c6d3a368e8645997e9637d08307ff37e0b906682
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date: Fri Apr 17 12:44:10 2015 +0200
Factor out msometa from vsdstream
So that it can be reused for DOC.
diff --git a/Makefile b/Makefile
index ef91870..768c967 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,6 @@
check:
cd test/doc && ./test.py
+ pep8 --ignore=E501 msodumper/msometa.py
pep8 --ignore=E501 doc-dump.py msodumper/doc{dirstream,record,sprm,stream}.py test/doc/test.py
pep8 --ignore=E501 emf-dump.py msodumper/{emf,wmf}record.py
pep8 --ignore=E501 vsd-dump.py msodumper/vsdstream.py test/vsd-test.py
diff --git a/msodumper/msometa.py b/msodumper/msometa.py
new file mode 100644
index 0000000..8b87284
--- /dev/null
+++ b/msodumper/msometa.py
@@ -0,0 +1,309 @@
+#!/usr/bin/env python2
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+import ole
+import ctypes
+import struct
+from docdirstream import DOCDirStream
+import docrecord
+import globals
+import sys
+import os
+import bisect
+
+
+PIDDSI = {
+ 0x00000001: "PIDDSI_CODEPAGE",
+ 0x00000002: "PIDDSI_CATEGORY",
+ 0x00000003: "PIDDSI_PRESFORMAT",
+ 0x00000004: "PIDDSI_BYTECOUNT",
+ 0x00000005: "PIDDSI_LINECOUNT",
+ 0x00000006: "PIDDSI_PARACOUNT",
+ 0x00000007: "PIDDSI_SLIDECOUNT",
+ 0x00000008: "PIDDSI_NOTECOUNT",
+ 0x00000009: "PIDDSI_HIDDENCOUNT",
+ 0x0000000A: "PIDDSI_MMCLIPCOUNT",
+ 0x0000000B: "PIDDSI_SCALE",
+ 0x0000000C: "PIDDSI_HEADINGPAIR",
+ 0x0000000D: "PIDDSI_DOCPARTS",
+ 0x0000000E: "PIDDSI_MANAGER",
+ 0x0000000F: "PIDDSI_COMPANY",
+ 0x00000010: "PIDDSI_LINKSDIRTY",
+ 0x00000011: "PIDDSI_CCHWITHSPACES",
+ 0x00000013: "PIDDSI_SHAREDDOC",
+ 0x00000014: "PIDDSI_LINKBASE",
+ 0x00000015: "PIDDSI_HLINKS",
+ 0x00000016: "PIDDSI_HYPERLINKSCHANGED",
+ 0x00000017: "PIDDSI_VERSION",
+ 0x00000018: "PIDDSI_DIGSIG",
+ 0x0000001A: "PIDDSI_CONTENTTYPE",
+ 0x0000001B: "PIDDSI_CONTENTSTATUS",
+ 0x0000001C: "PIDDSI_LANGUAGE",
+ 0x0000001D: "PIDDSI_DOCVERSION",
+}
+
+
+class DocumentSummaryInformationStream(DOCDirStream):
+ def __init__(self, bytes, params, doc):
+ DOCDirStream.__init__(self, bytes, params, "\x05DocumentSummaryInformation", doc=doc)
+
+ def dump(self):
+ print '<stream name="\\x05DocumentSummaryInformation" size="%d">' % self.size
+ PropertySetStream(self, PIDDSI).dump()
+ print '</stream>'
+
+
+PIDSI = {
+ 0x00000001: "CODEPAGE_PROPERTY_IDENTIFIER",
+ 0x00000002: "PIDSI_TITLE",
+ 0x00000003: "PIDSI_SUBJECT",
+ 0x00000004: "PIDSI_AUTHOR",
+ 0x00000005: "PIDSI_KEYWORDS",
+ 0x00000006: "PIDSI_COMMENTS",
+ 0x00000007: "PIDSI_TEMPLATE",
+ 0x00000008: "PIDSI_LASTAUTHOR",
+ 0x00000009: "PIDSI_REVNUMBER",
+ 0x0000000A: "PIDSI_EDITTIME",
+ 0x0000000B: "PIDSI_LASTPRINTED",
+ 0x0000000C: "PIDSI_CREATE_DTM",
+ 0x0000000D: "PIDSI_LASTSAVE_DTM",
+ 0x0000000E: "PIDSI_PAGECOUNT",
+ 0x0000000F: "PIDSI_WORDCOUNT",
+ 0x00000010: "PIDSI_CHARCOUNT",
+ 0x00000011: "PIDSI_THUMBNAIL",
+ 0x00000012: "PIDSI_APPNAME",
+ 0x00000013: "PIDSI_DOC_SECURITY",
+}
+
+
+class SummaryInformationStream(DOCDirStream):
+ def __init__(self, bytes, params, doc):
+ DOCDirStream.__init__(self, bytes, params, "\x05SummaryInformation", doc=doc)
+
+ def dump(self):
+ print '<stream name="\\x05SummaryInformation" size="%d">' % self.size
+ PropertySetStream(self, PIDSI).dump()
+ print '</stream>'
+
+
+class PropertySetStream(DOCDirStream):
+ def __init__(self, parent, PropertyIds):
+ DOCDirStream.__init__(self, parent.bytes)
+ self.parent = parent
+ self.propertyIds = PropertyIds
+
+ def dump(self):
+ print '<propertySetStream type="PropertySetStream" offset="%s">' % self.pos
+ self.printAndSet("ByteOrder", self.readuInt16())
+ self.printAndSet("Version", self.readuInt16())
+ self.printAndSet("SystemIdentifier", self.readuInt32())
+ self.printAndSet("CLSID0", self.readuInt32())
+ self.printAndSet("CLSID1", self.readuInt32())
+ self.printAndSet("CLSID2", self.readuInt32())
+ self.printAndSet("CLSID3", self.readuInt32())
+ self.printAndSet("NumPropertySets", self.readuInt32())
+ GUID(self, "FMTID0").dump()
+ self.printAndSet("Offset0", self.readuInt32())
+ PropertySet(self, self.Offset0).dump()
+ if self.NumPropertySets == 0x00000002:
+ GUID(self, "FMTID1").dump()
+ self.printAndSet("Offset1", self.readuInt32())
+ self.propertyIds = {}
+ PropertySet(self, self.Offset1).dump()
+ print '</propertySetStream>'
+
+
+class PropertySet(DOCDirStream):
+ def __init__(self, parent, offset):
+ DOCDirStream.__init__(self, parent.bytes)
+ self.parent = parent
+ self.pos = offset
+
+ def getCodePage(self):
+ for index, idAndOffset in enumerate(self.idsAndOffsets):
+ if idAndOffset.PropertyIdentifier == 0x00000001: # CODEPAGE_PROPERTY_IDENTIFIER
+ return self.typedPropertyValues[index].Value
+
+ def dump(self):
+ self.posOrig = self.pos
+ print '<propertySet type="PropertySet" offset="%s">' % self.pos
+ self.printAndSet("Size", self.readuInt32())
+ self.printAndSet("NumProperties", self.readuInt32())
+ self.idsAndOffsets = []
+ for i in range(self.NumProperties):
+ idAndOffset = PropertyIdentifierAndOffset(self, i)
+ idAndOffset.dump()
+ self.idsAndOffsets.append(idAndOffset)
+ self.typedPropertyValues = []
+ for i in range(self.NumProperties):
+ typedPropertyValue = TypedPropertyValue(self, i)
+ typedPropertyValue.dump()
+ self.typedPropertyValues.append(typedPropertyValue)
+ print '</propertySet>'
+
+
+class PropertyIdentifierAndOffset(DOCDirStream):
+ def __init__(self, parent, index):
+ DOCDirStream.__init__(self, parent.bytes)
+ self.parent = parent
+ self.index = index
+ self.pos = parent.pos
+
+ def dump(self):
+ print '<propertyIdentifierAndOffset%s type="PropertyIdentifierAndOffset" offset="%s">' % (self.index, self.pos)
+ self.printAndSet("PropertyIdentifier", self.readuInt32(), dict=self.parent.parent.propertyIds)
+ self.printAndSet("Offset", self.readuInt32())
+ print '</propertyIdentifierAndOffset%s>' % self.index
+ self.parent.pos = self.pos
+
+PropertyType = {
+ 0x0000: "VT_EMPTY",
+ 0x0001: "VT_NULL",
+ 0x0002: "VT_I2",
+ 0x0003: "VT_I4",
+ 0x0004: "VT_R4",
+ 0x0005: "VT_R8",
+ 0x0006: "VT_CY",
+ 0x0007: "VT_DATE",
+ 0x0008: "VT_BSTR",
+ 0x000A: "VT_ERROR",
+ 0x000B: "VT_BOOL",
+ 0x000E: "VT_DECIMAL",
+ 0x0010: "VT_I1",
+ 0x0011: "VT_UI1",
+ 0x0012: "VT_UI2",
+ 0x0013: "VT_UI4",
+ 0x0014: "VT_I8",
+ 0x0015: "VT_UI8",
+ 0x0016: "VT_INT",
+ 0x0017: "VT_UINT",
+ 0x001E: "VT_LPSTR",
+ 0x001F: "VT_LPWSTR",
+ 0x0040: "VT_FILETIME",
+ 0x0041: "VT_BLOB",
+ 0x0042: "VT_STREAM",
+ 0x0043: "VT_STORAGE",
+ 0x0044: "VT_STREAMED_Object",
+ 0x0045: "VT_STORED_Object",
+ 0x0046: "VT_BLOB_Object",
+ 0x0047: "VT_CF",
+ 0x0048: "VT_CLSID",
+ 0x0049: "VT_VERSIONED_STREAM",
+ 0x1002: "VT_VECTOR | VT_I2",
+ 0x1003: "VT_VECTOR | VT_I4",
+ 0x1004: "VT_VECTOR | VT_R4",
+ 0x1005: "VT_VECTOR | VT_R8",
+ 0x1006: "VT_VECTOR | VT_CY",
+ 0x1007: "VT_VECTOR | VT_DATE",
+ 0x1008: "VT_VECTOR | VT_BSTR",
+ 0x100A: "VT_VECTOR | VT_ERROR",
+ 0x100B: "VT_VECTOR | VT_BOOL",
+ 0x100C: "VT_VECTOR | VT_VARIANT",
+ 0x1010: "VT_VECTOR | VT_I1",
+ 0x1011: "VT_VECTOR | VT_UI1",
+ 0x1012: "VT_VECTOR | VT_UI2",
+ 0x1013: "VT_VECTOR | VT_UI4",
+ 0x1014: "VT_VECTOR | VT_I8",
+ 0x1015: "VT_VECTOR | VT_UI8",
+ 0x101E: "VT_VECTOR | VT_LPSTR",
+ 0x101F: "VT_VECTOR | VT_LPWSTR",
+ 0x1040: "VT_VECTOR | VT_FILETIME",
+ 0x1047: "VT_VECTOR | VT_CF",
+ 0x1048: "VT_VECTOR | VT_CLSID",
+ 0x2002: "VT_ARRAY | VT_I2",
+ 0x2003: "VT_ARRAY | VT_I4",
+ 0x2004: "VT_ARRAY | VT_R4",
+ 0x2005: "VT_ARRAY | VT_R8",
+ 0x2006: "VT_ARRAY | VT_CY",
+ 0x2007: "VT_ARRAY | VT_DATE",
+ 0x2008: "VT_ARRAY | VT_BSTR",
+ 0x200A: "VT_ARRAY | VT_ERROR",
+ 0x200B: "VT_ARRAY | VT_BOOL",
+ 0x200C: "VT_ARRAY | VT_VARIANT",
+ 0x200E: "VT_ARRAY | VT_DECIMAL",
+ 0x2010: "VT_ARRAY | VT_I1",
+ 0x2011: "VT_ARRAY | VT_UI1",
+ 0x2012: "VT_ARRAY | VT_UI2",
+ 0x2013: "VT_ARRAY | VT_UI4",
+ 0x2016: "VT_ARRAY | VT_INT",
+ 0x2017: "VT_ARRAY | VT_UINT",
+
+}
+
+
+class TypedPropertyValue(DOCDirStream):
+ def __init__(self, parent, index):
+ DOCDirStream.__init__(self, parent.bytes)
+ self.parent = parent
+ self.index = index
+ self.pos = parent.posOrig + parent.idsAndOffsets[index].Offset
+
+ def dump(self):
+ print '<typedPropertyValue%s type="TypedPropertyValue" offset="%s">' % (self.index, self.pos)
+ self.printAndSet("Type", self.readuInt16(), dict=PropertyType)
+ self.printAndSet("Padding", self.readuInt16())
+ if self.Type == 0x0002: # VT_I2
+ self.printAndSet("Value", self.readInt16())
+ elif self.Type == 0x001E: # VT_LPSTR
+ CodePageString(self, "Value").dump()
+ else:
+ print '<todo what="TypedPropertyValue::dump: unhandled Type %s"/>' % hex(self.Type)
+ print '</typedPropertyValue%s>' % self.index
+
+
+class CodePageString(DOCDirStream):
+ def __init__(self, parent, name):
+ DOCDirStream.__init__(self, parent.bytes)
+ self.pos = parent.pos
+ self.parent = parent
+ self.name = name
+
+ def dump(self):
+ print '<%s type="CodePageString">' % self.name
+ self.printAndSet("Size", self.readuInt32())
+ bytes = []
+ for i in range(self.Size):
+ c = self.readuInt8()
+ if c == 0:
+ break
+ bytes.append(c)
+ codepage = self.parent.parent.getCodePage()
+ if codepage < 0:
+ codepage += 2 ** 16 # signed -> unsigned
+ encoding = ""
+ if codepage == 1252:
+ # http://msdn.microsoft.com/en-us/goglobal/bb964654
+ encoding = "latin1"
+ elif codepage == 65001:
+ # http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130%28v=vs.85%29.aspx
+ encoding = "utf-8"
+ if len(encoding):
+ print '<Characters value="%s"/>' % "".join(map(lambda c: chr(c), bytes)).decode(encoding).encode('utf-8')
+ else:
+ print '<todo what="CodePageString::dump: unhandled codepage %s"/>' % codepage
+ print '</%s>' % self.name
+
+
+class GUID(DOCDirStream):
+ def __init__(self, parent, name):
+ DOCDirStream.__init__(self, parent.bytes)
+ self.pos = parent.pos
+ self.parent = parent
+ self.name = name
+
+ def dump(self):
+ Data1 = self.readuInt32()
+ Data2 = self.readuInt16()
+ Data3 = self.readuInt16()
+ Data4 = []
+ for i in range(8):
+ Data4.append(self.readuInt8())
+ value = "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x" % (Data1, Data2, Data3, Data4[0], Data4[1], Data4[2], Data4[3], Data4[4], Data4[5], Data4[6], Data4[7])
+ print '<%s type="GUID" value="%s"/>' % (self.name, value)
+ self.parent.pos = self.pos
+
+# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
diff --git a/msodumper/vsdstream.py b/msodumper/vsdstream.py
index 4c7cd67..95fca0c 100644
--- a/msodumper/vsdstream.py
+++ b/msodumper/vsdstream.py
@@ -14,6 +14,8 @@ import globals
import sys
import os
import bisect
+from msometa import SummaryInformationStream
+from msometa import DocumentSummaryInformationStream
class VSDFile:
@@ -58,295 +60,4 @@ class VSDFile:
def createVSDFile(chars, params):
return VSDFile(chars, params)
-
-PIDDSI = {
- 0x00000001: "PIDDSI_CODEPAGE",
- 0x00000002: "PIDDSI_CATEGORY",
- 0x00000003: "PIDDSI_PRESFORMAT",
- 0x00000004: "PIDDSI_BYTECOUNT",
- 0x00000005: "PIDDSI_LINECOUNT",
- 0x00000006: "PIDDSI_PARACOUNT",
- 0x00000007: "PIDDSI_SLIDECOUNT",
- 0x00000008: "PIDDSI_NOTECOUNT",
- 0x00000009: "PIDDSI_HIDDENCOUNT",
- 0x0000000A: "PIDDSI_MMCLIPCOUNT",
- 0x0000000B: "PIDDSI_SCALE",
- 0x0000000C: "PIDDSI_HEADINGPAIR",
- 0x0000000D: "PIDDSI_DOCPARTS",
- 0x0000000E: "PIDDSI_MANAGER",
- 0x0000000F: "PIDDSI_COMPANY",
- 0x00000010: "PIDDSI_LINKSDIRTY",
- 0x00000011: "PIDDSI_CCHWITHSPACES",
- 0x00000013: "PIDDSI_SHAREDDOC",
- 0x00000014: "PIDDSI_LINKBASE",
- 0x00000015: "PIDDSI_HLINKS",
- 0x00000016: "PIDDSI_HYPERLINKSCHANGED",
- 0x00000017: "PIDDSI_VERSION",
- 0x00000018: "PIDDSI_DIGSIG",
- 0x0000001A: "PIDDSI_CONTENTTYPE",
- 0x0000001B: "PIDDSI_CONTENTSTATUS",
- 0x0000001C: "PIDDSI_LANGUAGE",
- 0x0000001D: "PIDDSI_DOCVERSION",
-}
-
-
-class DocumentSummaryInformationStream(DOCDirStream):
- def __init__(self, bytes, params, doc):
- DOCDirStream.__init__(self, bytes, params, "\x05DocumentSummaryInformation", doc=doc)
-
- def dump(self):
- print '<stream name="\\x05DocumentSummaryInformation" size="%d">' % self.size
- PropertySetStream(self, PIDDSI).dump()
- print '</stream>'
-
-
-PIDSI = {
- 0x00000001: "CODEPAGE_PROPERTY_IDENTIFIER",
- 0x00000002: "PIDSI_TITLE",
- 0x00000003: "PIDSI_SUBJECT",
- 0x00000004: "PIDSI_AUTHOR",
- 0x00000005: "PIDSI_KEYWORDS",
- 0x00000006: "PIDSI_COMMENTS",
- 0x00000007: "PIDSI_TEMPLATE",
- 0x00000008: "PIDSI_LASTAUTHOR",
- 0x00000009: "PIDSI_REVNUMBER",
- 0x0000000A: "PIDSI_EDITTIME",
- 0x0000000B: "PIDSI_LASTPRINTED",
- 0x0000000C: "PIDSI_CREATE_DTM",
- 0x0000000D: "PIDSI_LASTSAVE_DTM",
- 0x0000000E: "PIDSI_PAGECOUNT",
- 0x0000000F: "PIDSI_WORDCOUNT",
- 0x00000010: "PIDSI_CHARCOUNT",
- 0x00000011: "PIDSI_THUMBNAIL",
- 0x00000012: "PIDSI_APPNAME",
- 0x00000013: "PIDSI_DOC_SECURITY",
-}
-
-
-class SummaryInformationStream(DOCDirStream):
- def __init__(self, bytes, params, doc):
- DOCDirStream.__init__(self, bytes, params, "\x05SummaryInformation", doc=doc)
-
- def dump(self):
- print '<stream name="\\x05SummaryInformation" size="%d">' % self.size
- PropertySetStream(self, PIDSI).dump()
- print '</stream>'
-
-
-class PropertySetStream(DOCDirStream):
- def __init__(self, parent, PropertyIds):
- DOCDirStream.__init__(self, parent.bytes)
- self.parent = parent
- self.propertyIds = PropertyIds
-
- def dump(self):
- print '<propertySetStream type="PropertySetStream" offset="%s">' % self.pos
- self.printAndSet("ByteOrder", self.readuInt16())
- self.printAndSet("Version", self.readuInt16())
- self.printAndSet("SystemIdentifier", self.readuInt32())
- self.printAndSet("CLSID0", self.readuInt32())
- self.printAndSet("CLSID1", self.readuInt32())
- self.printAndSet("CLSID2", self.readuInt32())
- self.printAndSet("CLSID3", self.readuInt32())
- self.printAndSet("NumPropertySets", self.readuInt32())
- GUID(self, "FMTID0").dump()
- self.printAndSet("Offset0", self.readuInt32())
- PropertySet(self, self.Offset0).dump()
- if self.NumPropertySets == 0x00000002:
- GUID(self, "FMTID1").dump()
- self.printAndSet("Offset1", self.readuInt32())
- self.propertyIds = {}
- PropertySet(self, self.Offset1).dump()
- print '</propertySetStream>'
-
-
-class PropertySet(DOCDirStream):
- def __init__(self, parent, offset):
- DOCDirStream.__init__(self, parent.bytes)
- self.parent = parent
- self.pos = offset
-
- def getCodePage(self):
- for index, idAndOffset in enumerate(self.idsAndOffsets):
- if idAndOffset.PropertyIdentifier == 0x00000001: # CODEPAGE_PROPERTY_IDENTIFIER
- return self.typedPropertyValues[index].Value
-
- def dump(self):
- self.posOrig = self.pos
- print '<propertySet type="PropertySet" offset="%s">' % self.pos
- self.printAndSet("Size", self.readuInt32())
- self.printAndSet("NumProperties", self.readuInt32())
- self.idsAndOffsets = []
- for i in range(self.NumProperties):
- idAndOffset = PropertyIdentifierAndOffset(self, i)
- idAndOffset.dump()
- self.idsAndOffsets.append(idAndOffset)
- self.typedPropertyValues = []
- for i in range(self.NumProperties):
- typedPropertyValue = TypedPropertyValue(self, i)
- typedPropertyValue.dump()
- self.typedPropertyValues.append(typedPropertyValue)
- print '</propertySet>'
-
-
-class PropertyIdentifierAndOffset(DOCDirStream):
- def __init__(self, parent, index):
- DOCDirStream.__init__(self, parent.bytes)
- self.parent = parent
- self.index = index
- self.pos = parent.pos
-
- def dump(self):
- print '<propertyIdentifierAndOffset%s type="PropertyIdentifierAndOffset" offset="%s">' % (self.index, self.pos)
- self.printAndSet("PropertyIdentifier", self.readuInt32(), dict=self.parent.parent.propertyIds)
- self.printAndSet("Offset", self.readuInt32())
- print '</propertyIdentifierAndOffset%s>' % self.index
- self.parent.pos = self.pos
-
-PropertyType = {
- 0x0000: "VT_EMPTY",
- 0x0001: "VT_NULL",
- 0x0002: "VT_I2",
- 0x0003: "VT_I4",
- 0x0004: "VT_R4",
- 0x0005: "VT_R8",
- 0x0006: "VT_CY",
- 0x0007: "VT_DATE",
- 0x0008: "VT_BSTR",
- 0x000A: "VT_ERROR",
- 0x000B: "VT_BOOL",
- 0x000E: "VT_DECIMAL",
- 0x0010: "VT_I1",
- 0x0011: "VT_UI1",
- 0x0012: "VT_UI2",
- 0x0013: "VT_UI4",
- 0x0014: "VT_I8",
- 0x0015: "VT_UI8",
- 0x0016: "VT_INT",
- 0x0017: "VT_UINT",
- 0x001E: "VT_LPSTR",
- 0x001F: "VT_LPWSTR",
- 0x0040: "VT_FILETIME",
- 0x0041: "VT_BLOB",
- 0x0042: "VT_STREAM",
- 0x0043: "VT_STORAGE",
- 0x0044: "VT_STREAMED_Object",
- 0x0045: "VT_STORED_Object",
- 0x0046: "VT_BLOB_Object",
- 0x0047: "VT_CF",
- 0x0048: "VT_CLSID",
- 0x0049: "VT_VERSIONED_STREAM",
- 0x1002: "VT_VECTOR | VT_I2",
- 0x1003: "VT_VECTOR | VT_I4",
- 0x1004: "VT_VECTOR | VT_R4",
- 0x1005: "VT_VECTOR | VT_R8",
- 0x1006: "VT_VECTOR | VT_CY",
- 0x1007: "VT_VECTOR | VT_DATE",
- 0x1008: "VT_VECTOR | VT_BSTR",
- 0x100A: "VT_VECTOR | VT_ERROR",
- 0x100B: "VT_VECTOR | VT_BOOL",
- 0x100C: "VT_VECTOR | VT_VARIANT",
- 0x1010: "VT_VECTOR | VT_I1",
- 0x1011: "VT_VECTOR | VT_UI1",
- 0x1012: "VT_VECTOR | VT_UI2",
- 0x1013: "VT_VECTOR | VT_UI4",
- 0x1014: "VT_VECTOR | VT_I8",
- 0x1015: "VT_VECTOR | VT_UI8",
- 0x101E: "VT_VECTOR | VT_LPSTR",
- 0x101F: "VT_VECTOR | VT_LPWSTR",
- 0x1040: "VT_VECTOR | VT_FILETIME",
- 0x1047: "VT_VECTOR | VT_CF",
- 0x1048: "VT_VECTOR | VT_CLSID",
- 0x2002: "VT_ARRAY | VT_I2",
- 0x2003: "VT_ARRAY | VT_I4",
- 0x2004: "VT_ARRAY | VT_R4",
- 0x2005: "VT_ARRAY | VT_R8",
- 0x2006: "VT_ARRAY | VT_CY",
- 0x2007: "VT_ARRAY | VT_DATE",
- 0x2008: "VT_ARRAY | VT_BSTR",
- 0x200A: "VT_ARRAY | VT_ERROR",
- 0x200B: "VT_ARRAY | VT_BOOL",
- 0x200C: "VT_ARRAY | VT_VARIANT",
- 0x200E: "VT_ARRAY | VT_DECIMAL",
- 0x2010: "VT_ARRAY | VT_I1",
- 0x2011: "VT_ARRAY | VT_UI1",
- 0x2012: "VT_ARRAY | VT_UI2",
- 0x2013: "VT_ARRAY | VT_UI4",
- 0x2016: "VT_ARRAY | VT_INT",
- 0x2017: "VT_ARRAY | VT_UINT",
-
-}
-
-
-class TypedPropertyValue(DOCDirStream):
- def __init__(self, parent, index):
- DOCDirStream.__init__(self, parent.bytes)
- self.parent = parent
- self.index = index
- self.pos = parent.posOrig + parent.idsAndOffsets[index].Offset
-
- def dump(self):
- print '<typedPropertyValue%s type="TypedPropertyValue" offset="%s">' % (self.index, self.pos)
- self.printAndSet("Type", self.readuInt16(), dict=PropertyType)
- self.printAndSet("Padding", self.readuInt16())
- if self.Type == 0x0002: # VT_I2
- self.printAndSet("Value", self.readInt16())
- elif self.Type == 0x001E: # VT_LPSTR
- CodePageString(self, "Value").dump()
- else:
- print '<todo what="TypedPropertyValue::dump: unhandled Type %s"/>' % hex(self.Type)
- print '</typedPropertyValue%s>' % self.index
-
-
-class CodePageString(DOCDirStream):
- def __init__(self, parent, name):
- DOCDirStream.__init__(self, parent.bytes)
- self.pos = parent.pos
- self.parent = parent
- self.name = name
-
- def dump(self):
- print '<%s type="CodePageString">' % self.name
- self.printAndSet("Size", self.readuInt32())
- bytes = []
- for i in range(self.Size):
- c = self.readuInt8()
- if c == 0:
- break
- bytes.append(c)
- codepage = self.parent.parent.getCodePage()
- if codepage < 0:
- codepage += 2 ** 16 # signed -> unsigned
- encoding = ""
- if codepage == 1252:
- # http://msdn.microsoft.com/en-us/goglobal/bb964654
- encoding = "latin1"
- elif codepage == 65001:
- # http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130%28v=vs.85%29.aspx
- encoding = "utf-8"
- if len(encoding):
- print '<Characters value="%s"/>' % "".join(map(lambda c: chr(c), bytes)).decode(encoding).encode('utf-8')
- else:
- print '<todo what="CodePageString::dump: unhandled codepage %s"/>' % codepage
- print '</%s>' % self.name
-
-
-class GUID(DOCDirStream):
- def __init__(self, parent, name):
- DOCDirStream.__init__(self, parent.bytes)
- self.pos = parent.pos
- self.parent = parent
- self.name = name
-
- def dump(self):
- Data1 = self.readuInt32()
- Data2 = self.readuInt16()
- Data3 = self.readuInt16()
- Data4 = []
- for i in range(8):
- Data4.append(self.readuInt8())
- value = "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x" % (Data1, Data2, Data3, Data4[0], Data4[1], Data4[2], Data4[3], Data4[4], Data4[5], Data4[6], Data4[7])
- print '<%s type="GUID" value="%s"/>' % (self.name, value)
- self.parent.pos = self.pos
-
# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
More information about the Libreoffice-commits
mailing list