[Libreoffice-commits] mso-dumper.git: 3 commits - doc-dump.py src/docstream.py

Miklos Vajna vmiklos at kemper.freedesktop.org
Fri May 17 07:57:58 PDT 2013


 doc-dump.py      |    4 +--
 src/docstream.py |   66 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 66 insertions(+), 4 deletions(-)

New commits:
commit ea28062eb49560c8917d19d6e20152d851858e6e
Author: Miklos Vajna <vmiklos at suse.cz>
Date:   Fri May 17 16:57:51 2013 +0200

    give a usable error message on OLE-based WW6 input

diff --git a/src/docstream.py b/src/docstream.py
index a94d0d3..5c024fe 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -187,6 +187,9 @@ class WordDocumentStream(DOCDirStream):
 
         self.printAndSet("wIdent", self.readuInt16())
         self.printAndSet("nFib", self.readuInt16())
+        if self.nFib >= 0x65 and self.nFib <= 0x69:
+            print '<todo what="handle nFib 0x65..0x69: ww6 syntax"/>'
+            ret = False
         self.printAndSet("unused", self.readuInt16())
         self.printAndSet("lid", self.readuInt16())
         self.printAndSet("pnNext", self.readuInt16())
commit 0af1cd44f27a2f93ebcaa9265ad3e4f92107d2a3
Author: Miklos Vajna <vmiklos at suse.cz>
Date:   Fri May 17 15:29:16 2013 +0200

    doc: use gsf to read the ole streams when it's available
    
    The GSF parser can deal with e.g. fdo33590-1.doc

diff --git a/doc-dump.py b/doc-dump.py
index 17aec51..b99d6fe 100755
--- a/doc-dump.py
+++ b/doc-dump.py
@@ -19,10 +19,10 @@ class DOCDumper:
 
     def dump(self):
         file = open(self.filepath, 'rb')
-        strm = docstream.DOCFile(file.read(), self.params)
+        strm = docstream.createDOCFile(file.read(), self.params)
         file.close()
         dirnames = strm.getDirectoryNames()
-        print '<?xml version="1.0"?>\n<streams>'
+        print '<?xml version="1.0"?>\n<streams ole-type="%s">' % strm.getName()
         for dirname in dirnames:
             if len(dirname) == 0 or dirname in ['Root Entry']:
                 continue
diff --git a/src/docstream.py b/src/docstream.py
index e9a674f..a94d0d3 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -6,6 +6,7 @@
 #
 
 import ole
+import ctypes
 import struct
 from docdirstream import DOCDirStream
 import docrecord
@@ -54,6 +55,58 @@ class DOCFile:
         else:
             return DOCDirStream(bytes, self.params, name, doc=self)
 
+    def getName(self):
+        return "native"
+
+class GsfDOCFile(DOCFile):
+    """Same as DOCFile, but uses gsf to read the OLE streams."""
+    def __init__ (self, chars, params, gsf):
+        self.gsf = gsf
+        DOCFile.__init__(self, chars, params)
+
+    def initWW8(self):
+        self.streams = {}
+        self.gsf.gsf_init()
+        gsfInput = self.gsf.gsf_input_memory_new(self.chars, len(self.chars), False)
+        gsfInfile = self.gsf.gsf_infile_msole_new(gsfInput)
+        for i in range(self.gsf.gsf_infile_num_children(gsfInfile)):
+            child = self.gsf.gsf_infile_child_by_index(gsfInfile, i)
+            childName = ctypes.string_at(self.gsf.gsf_infile_name_by_index(gsfInfile,i))
+            childSize = self.gsf.gsf_input_size(child)
+            childData = ""
+            while True:
+                bufSize = 1024
+                pos = self.gsf.gsf_input_tell(child)
+                if pos == childSize:
+                    break
+                elif pos + bufSize > childSize:
+                    bufSize = childSize - pos
+                childData += ctypes.string_at(self.gsf.gsf_input_read(child, bufSize, None), bufSize)
+            self.streams[childName] = childData
+        self.gsf.gsf_shutdown()
+
+    def getDirectoryNames(self):
+        return self.streams.keys()
+
+    def getDirectoryStreamByName(self, name):
+        return self.getStreamFromBytes(name, self.streams[name])
+
+    def getName(self):
+        return "gsf"
+
+def createDOCFile(chars, params):
+    hasGsf = True
+    try:
+        gsf = ctypes.cdll.LoadLibrary('libgsf-1.so')
+        gsf.gsf_input_read.restype = ctypes.c_void_p
+    except:
+        hasGsf = False
+
+    if hasGsf:
+        return GsfDOCFile(chars, params, gsf)
+    else:
+        return DOCFile(chars, params)
+
 class TableStream(DOCDirStream):
     def __init__(self, bytes, params, name, doc):
         DOCDirStream.__init__(self, bytes, params, name, doc = doc)
commit a2522a47ae8ea503130e8e662d630a47cbdb5d95
Author: Miklos Vajna <vmiklos at suse.cz>
Date:   Fri May 17 15:08:47 2013 +0200

    doc: refactor to separate code that is specific to our own ole parser

diff --git a/src/docstream.py b/src/docstream.py
index e888299..e9a674f 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -20,8 +20,7 @@ class DOCFile:
         self.params = params
 
         if ord(self.chars[0]) == 0xD0 and ord(self.chars[1]) == 0xCF and ord(self.chars[2]) == 0x11 and ord(self.chars[3]) == 0xE0:
-            self.header = ole.Header(self.chars, self.params)
-            self.pos = self.header.parse()
+            self.initWW8()
         else:
             print '<?xml version="1.0"?>'
             if ord(self.chars[0]) == 0xDB and ord(self.chars[1]) == 0xA5:
@@ -30,6 +29,10 @@ class DOCFile:
                 print '<todo what="unhandled magic"/>'
             sys.exit(0)
 
+    def initWW8(self):
+            self.header = ole.Header(self.chars, self.params)
+            self.pos = self.header.parse()
+
     def __getDirectoryObj(self):
         obj = self.header.getDirectory()
         obj.parseDirEntries()
@@ -41,6 +44,9 @@ class DOCFile:
     def getDirectoryStreamByName(self, name):
         obj = self.__getDirectoryObj()
         bytes = obj.getRawStreamByName(name)
+        return self.getStreamFromBytes(name, bytes)
+
+    def getStreamFromBytes(self, name, bytes):
         if name == "WordDocument":
             return WordDocumentStream(bytes, self.params, doc=self)
         if name == "1Table":


More information about the Libreoffice-commits mailing list