[Libreoffice-commits] mso-dumper.git: 3 commits - doc-dump.py src/docstream.py
Miklos Vajna
vmiklos at kemper.freedesktop.org
Fri May 17 07:57:58 PDT 2013
doc-dump.py | 4 +--
src/docstream.py | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 66 insertions(+), 4 deletions(-)
New commits:
commit ea28062eb49560c8917d19d6e20152d851858e6e
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Fri May 17 16:57:51 2013 +0200
give usable error message on ole-based ww6 input
diff --git a/src/docstream.py b/src/docstream.py
index a94d0d3..5c024fe 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -187,6 +187,9 @@ class WordDocumentStream(DOCDirStream):
self.printAndSet("wIdent", self.readuInt16())
self.printAndSet("nFib", self.readuInt16())
+ if self.nFib >= 0x65 and self.nFib <= 0x69:
+ print '<todo what="handle nFib 0x65..0x69: ww6 syntax"/>'
+ ret = False
self.printAndSet("unused", self.readuInt16())
self.printAndSet("lid", self.readuInt16())
self.printAndSet("pnNext", self.readuInt16())
commit 0af1cd44f27a2f93ebcaa9265ad3e4f92107d2a3
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Fri May 17 15:29:16 2013 +0200
doc: use gsf to read the ole streams when it's available
The GSF parser can deal with e.g. fdo33590-1.doc
diff --git a/doc-dump.py b/doc-dump.py
index 17aec51..b99d6fe 100755
--- a/doc-dump.py
+++ b/doc-dump.py
@@ -19,10 +19,10 @@ class DOCDumper:
def dump(self):
file = open(self.filepath, 'rb')
- strm = docstream.DOCFile(file.read(), self.params)
+ strm = docstream.createDOCFile(file.read(), self.params)
file.close()
dirnames = strm.getDirectoryNames()
- print '<?xml version="1.0"?>\n<streams>'
+ print '<?xml version="1.0"?>\n<streams ole-type="%s">' % strm.getName()
for dirname in dirnames:
if len(dirname) == 0 or dirname in ['Root Entry']:
continue
diff --git a/src/docstream.py b/src/docstream.py
index e9a674f..a94d0d3 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -6,6 +6,7 @@
#
import ole
+import ctypes
import struct
from docdirstream import DOCDirStream
import docrecord
@@ -54,6 +55,58 @@ class DOCFile:
else:
return DOCDirStream(bytes, self.params, name, doc=self)
+ def getName(self):
+ return "native"
+
+class GsfDOCFile(DOCFile):
+ """Same as DOCFile, but uses gsf to read the OLE streams."""
+ def __init__ (self, chars, params, gsf):
+ self.gsf = gsf
+ DOCFile.__init__(self, chars, params)
+
+ def initWW8(self):
+ self.streams = {}
+ self.gsf.gsf_init()
+ gsfInput = self.gsf.gsf_input_memory_new(self.chars, len(self.chars), False)
+ gsfInfile = self.gsf.gsf_infile_msole_new(gsfInput)
+ for i in range(self.gsf.gsf_infile_num_children(gsfInfile)):
+ child = self.gsf.gsf_infile_child_by_index(gsfInfile, i)
+ childName = ctypes.string_at(self.gsf.gsf_infile_name_by_index(gsfInfile,i))
+ childSize = self.gsf.gsf_input_size(child)
+ childData = ""
+ while True:
+ bufSize = 1024
+ pos = self.gsf.gsf_input_tell(child)
+ if pos == childSize:
+ break
+ elif pos + bufSize > childSize:
+ bufSize = childSize - pos
+ childData += ctypes.string_at(self.gsf.gsf_input_read(child, bufSize, None), bufSize)
+ self.streams[childName] = childData
+ self.gsf.gsf_shutdown()
+
+ def getDirectoryNames(self):
+ return self.streams.keys()
+
+ def getDirectoryStreamByName(self, name):
+ return self.getStreamFromBytes(name, self.streams[name])
+
+ def getName(self):
+ return "gsf"
+
+def createDOCFile(chars, params):
+ hasGsf = True
+ try:
+ gsf = ctypes.cdll.LoadLibrary('libgsf-1.so')
+ gsf.gsf_input_read.restype = ctypes.c_void_p
+ except:
+ hasGsf = False
+
+ if hasGsf:
+ return GsfDOCFile(chars, params, gsf)
+ else:
+ return DOCFile(chars, params)
+
class TableStream(DOCDirStream):
def __init__(self, bytes, params, name, doc):
DOCDirStream.__init__(self, bytes, params, name, doc = doc)
commit a2522a47ae8ea503130e8e662d630a47cbdb5d95
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Fri May 17 15:08:47 2013 +0200
doc: refactor to separate code that is specific to our own ole parser
diff --git a/src/docstream.py b/src/docstream.py
index e888299..e9a674f 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -20,8 +20,7 @@ class DOCFile:
self.params = params
if ord(self.chars[0]) == 0xD0 and ord(self.chars[1]) == 0xCF and ord(self.chars[2]) == 0x11 and ord(self.chars[3]) == 0xE0:
- self.header = ole.Header(self.chars, self.params)
- self.pos = self.header.parse()
+ self.initWW8()
else:
print '<?xml version="1.0"?>'
if ord(self.chars[0]) == 0xDB and ord(self.chars[1]) == 0xA5:
@@ -30,6 +29,10 @@ class DOCFile:
print '<todo what="unhandled magic"/>'
sys.exit(0)
+ def initWW8(self):
+ self.header = ole.Header(self.chars, self.params)
+ self.pos = self.header.parse()
+
def __getDirectoryObj(self):
obj = self.header.getDirectory()
obj.parseDirEntries()
@@ -41,6 +44,9 @@ class DOCFile:
def getDirectoryStreamByName(self, name):
obj = self.__getDirectoryObj()
bytes = obj.getRawStreamByName(name)
+ return self.getStreamFromBytes(name, bytes)
+
+ def getStreamFromBytes(self, name, bytes):
if name == "WordDocument":
return WordDocumentStream(bytes, self.params, doc=self)
if name == "1Table":
More information about the Libreoffice-commits mailing list