[Libreoffice-commits] mso-dumper.git: 6 commits - src/docdirstream.py src/docrecord.py src/docstream.py src/globals.py

Sun Aug 18 08:13:47 PDT 2013

 src/docdirstream.py |    9 ++++++---
 src/docrecord.py    |   40 ++++++++++++++++++++++++++++++++--------
 src/docstream.py    |    9 +++++++--
 src/globals.py      |    7 +++++--
 4 files changed, 50 insertions(+), 15 deletions(-)

New commits:
commit 806678b9a755eb7f304a855c1efe23e86901cea4
Author: Miklos Vajna <vmiklos at suse.cz>
Date:   Sun Aug 18 16:07:54 2013 +0200

    dump grfhic

diff --git a/src/docrecord.py b/src/docrecord.py
index a0d6cd4..61be2b4 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -3405,6 +3405,28 @@ class PlcfGram(DOCDirStream, PLC):
             print '</aCP>'
         print '</plcfGram>'
 
+class Grfhic(DOCDirStream):
+    """The grfhic structure is a set of HTML incompatibility flags that specify
+    the HTML incompatibilities of a list structure."""
+    def __init__(self, parent):
+        DOCDirStream.__init__(self, parent.bytes)
+        self.pos = parent.pos
+        self.parent = parent
+
+    def dump(self):
+        print '<grfhic type="grfhic">'
+        buf = self.readuInt8()
+        self.printAndSet("fhicChecked",  self.getBit(buf, 0))
+        self.printAndSet("fhicFormat",   self.getBit(buf, 1))
+        self.printAndSet("fhicListText", self.getBit(buf, 2))
+        self.printAndSet("fhicPeriod",   self.getBit(buf, 3))
+        self.printAndSet("fhicLeft1",    self.getBit(buf, 4))
+        self.printAndSet("fhicListTab",  self.getBit(buf, 5))
+        self.printAndSet("unused",       self.getBit(buf, 6))
+        self.printAndSet("fhicBullet",   self.getBit(buf, 7))
+        self.parent.pos = self.pos
+        print '</grfhic>'
+
 class LSTF(DOCDirStream):
     """The LSTF structure contains formatting properties that apply to an entire list."""
     def __init__(self, plfLst, index):
@@ -3426,7 +3448,7 @@ class LSTF(DOCDirStream):
         self.printAndSet("unused2", self.getBit(buf, 3))
         self.printAndSet("fHybrid", self.getBit(buf, 4))
         self.printAndSet("reserved1", (buf & 0xe0) >> 5) # 6..8th bits
-        self.printAndSet("grfhic", self.readuInt8()) # TODO dump grfhic
+        Grfhic(self).dump()
         print '</lstf>'
 
 class LVLF(DOCDirStream):
@@ -3455,7 +3477,7 @@ class LVLF(DOCDirStream):
         self.printAndSet("cbGrpprlChpx", self.readuInt8())
         self.printAndSet("cbGrpprlPapx", self.readuInt8())
         self.printAndSet("ilvlRestartLim", self.readuInt8())
-        self.printAndSet("grfhic", self.readuInt8()) # TODO dump grfhic
+        Grfhic(self).dump()
         print '</lvlf>'
 
 class LVL(DOCDirStream):
@@ -3531,7 +3553,7 @@ class LFO(DOCDirStream):
         self.printAndSet("unused2", self.readuInt32())
         self.printAndSet("clfolvl", self.readuInt8())
         self.printAndSet("ibstFltAutoNum", self.readuInt8())
-        self.printAndSet("grfhic", self.readuInt8()) # TODO dump grfhic
+        Grfhic(self).dump()
         self.printAndSet("unused3", self.readuInt8())
         print '</lfo>'
 
commit 7ebcbd32d92043bcc998e6dca2067ea86d5d9934
Author: Miklos Vajna <vmiklos at suse.cz>
Date:   Sun Aug 18 15:58:25 2013 +0200

    the spec says lcbPlcfLvcPre10 should be ignored as well

diff --git a/src/docstream.py b/src/docstream.py
index 22695d4..89cd0d7 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -520,7 +520,7 @@ class WordDocumentStream(DOCDirStream):
             value = self.readInt32()
             hasHandler = len(i) > 1
             # the spec says these must be ignored
-            needsIgnoring = ["lcbStshfOrig", "lcbPlcfBteLvc"]
+            needsIgnoring = ["lcbStshfOrig", "lcbPlcfBteLvc", "lcbPlcfLvcPre10"]
             # a member needs handling if it defines the size of a struct and it's non-zero
             needsHandling = i[0].startswith("lcb") and value != 0 and (not i[0] in needsIgnoring)
             self.printAndSet(i[0], value, end = ((not hasHandler) and (not needsHandling)), offset = True)
commit dea1e63ac3e62ca67192d39177c1de26fb15fd77
Author: Miklos Vajna <vmiklos at suse.cz>
Date:   Sun Aug 18 15:38:58 2013 +0200

    WordDocumentStream: don't throw on invalid utf16
    
    Just print a warning instead. ooo101417-1.doc triggers this.

diff --git a/src/docdirstream.py b/src/docdirstream.py
index 9888638..5cf1493 100644
--- a/src/docdirstream.py
+++ b/src/docdirstream.py
@@ -134,7 +134,7 @@ class DOCDirStream:
             else:
                 break
             count += 1
-        return globals.getUTF8FromUTF16("".join(map(lambda x: chr(x), bytes)))
+        return globals.getUTF8FromUTF16("".join(map(lambda x: chr(x), bytes)), xml = True)
 
     def getBit(self, byte, bitNumber):
         return (byte & (1 << bitNumber)) >> bitNumber
diff --git a/src/docstream.py b/src/docstream.py
index 398207c..22695d4 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -948,7 +948,12 @@ class WordDocumentStream(DOCDirStream):
         if compressed:
             return globals.encodeName(self.bytes[pos])
         else:
-            return globals.encodeName(self.bytes[pos:pos+2].decode('utf-16'), lowOnly = True)
+            try:
+                return globals.encodeName(self.bytes[pos:pos+2].decode('utf-16'), lowOnly = True)
+            except UnicodeDecodeError:
+                reason = 'could not decode bytes in position %d-%d (%s-%s)' % (pos, pos+1, hex(ord(self.bytes[pos])), hex(ord(self.bytes[pos+1])))
+                print '<todo what="WordDocumentStream::retrieveCP(): %s"/>' % reason
+                return globals.encodeName(self.bytes[pos:pos+2].decode('utf-16', errors="replace"), lowOnly = True)
 
     def retrieveCPs(self, start, end):
         """Retrieves a range of characters."""
diff --git a/src/globals.py b/src/globals.py
index 68aae93..31e995a 100644
--- a/src/globals.py
+++ b/src/globals.py
@@ -412,7 +412,7 @@ def getDouble (bytes):
     return struct.unpack('<d', text)[0]
 
 
-def getUTF8FromUTF16 (bytes):
+def getUTF8FromUTF16 (bytes, xml = False):
     # little endian utf-16 strings
     byteCount = len(bytes)
     loopCount = int(byteCount/2)
@@ -431,7 +431,10 @@ def getUTF8FromUTF16 (bytes):
         try:    
             text += unicode(code, 'utf-8')
         except UnicodeDecodeError:
-            text += "<%d invalid chars>"%len(code)
+            close = ""
+            if xml:
+                close="/"
+            text += "<%d invalid chars%s>"%(len(code), close)
     return text
 
 class StreamWrap(object):
commit 4031eb1e626a9c66c0c306708b6ea96bbfb468f6
Author: Miklos Vajna <vmiklos at suse.cz>
Date:   Sun Aug 18 15:02:58 2013 +0200

    TCGRF: tolerate undocumented TextFlow
    
    As seen in ooo100632-2.doc.

diff --git a/src/docdirstream.py b/src/docdirstream.py
index c3e4207..9888638 100644
--- a/src/docdirstream.py
+++ b/src/docdirstream.py
@@ -21,13 +21,16 @@ class DOCDirStream:
         self.mainStream = mainStream
         self.doc = doc
     
-    def printAndSet(self, key, value, hexdump = True, end = True, offset = False, silent = False, dict = None):
+    def printAndSet(self, key, value, hexdump = True, end = True, offset = False, silent = False, dict = None, default = None):
         setattr(self, key, value)
         if silent:
             return
         attrs = ""
         if dict:
-            attrs += ' name="%s"' % dict[value]
+            if value in dict or not default:
+                attrs += ' name="%s"' % dict[value]
+            else:
+                attrs += ' name="%s"' % default
         if hexdump:
             value = hex(value)
         if offset:
diff --git a/src/docrecord.py b/src/docrecord.py
index a9d99db..a0d6cd4 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -1072,7 +1072,7 @@ class TCGRF(DOCDirStream):
         print '<tcgrf type="TCGRF" offset="%d">' % self.pos
         buf = self.readuInt16()
         self.printAndSet("horzMerge", buf & 0x0003) # 1..2nd bits
-        self.printAndSet("textFlow",  (buf & 0x001c) >> 2, dict = TextFlow) # 3..6th bits
+        self.printAndSet("textFlow",  (buf & 0x001c) >> 2, dict = TextFlow, default = "todo") # 3..6th bits
         self.printAndSet("vertMerge", (buf & 0x0060) >> 6, dict = VerticalMergeFlag) # 7..8th bits
         self.printAndSet("vertAlign", (buf & 0x0180) >> 8, dict = VerticalAlign) # 9..10th bits
         self.printAndSet("ftsWidth",  (buf & 0x0e00) >> 10, dict = Fts) # 11..12th bits
commit e8fd0762c67fab15929288719015c9af5b57fb4c
Author: Miklos Vajna <vmiklos at suse.cz>
Date:   Sun Aug 18 13:01:28 2013 +0200

    PICFAndOfficeArtData: don't throw on MM_SHAPEFILE
    
    fdo54551-1.doc triggered this; with this change the dumper just prints a
    TODO instead of throwing.

diff --git a/src/docrecord.py b/src/docrecord.py
index 66ff07f..a9d99db 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -1019,8 +1019,9 @@ class PICFAndOfficeArtData(DOCDirStream):
             assert self.pos == pos + 68
             if picf.mfpf.mm == 0x0066:
                 print '<todo what="PICFAndOfficeArtData::dump(): picf.mfpf.mm == MM_SHAPEFILE is unhandled"/>'
-            remaining = picf.lcb - (self.pos - pos)
-            msodraw.InlineSpContainer(self, remaining).dumpXml(self, getWordModel(self.parent.mainStream))
+            else:
+                remaining = picf.lcb - (self.pos - pos)
+                msodraw.InlineSpContainer(self, remaining).dumpXml(self, getWordModel(self.parent.mainStream))
         else:
             print '<todo what="PICFAndOfficeArtData::dump(): handle sprmCFData or sprmCFOle2"/>'
         print '</PICFAndOfficeArtData>'
commit 8f43c92520fb01066e7c287eaa2ba69b9ebe74fe
Author: Miklos Vajna <vmiklos at suse.cz>
Date:   Sun Aug 18 11:51:35 2013 +0200

    PICFAndOfficeArtData: blacklist sprmCFOle2 as well
    
    According to the spec, this should not occur with a 0x01 placeholder
    char, but fdo48097-1.doc has it.

diff --git a/src/docrecord.py b/src/docrecord.py
index e424e65..66ff07f 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -1009,8 +1009,9 @@ class PICFAndOfficeArtData(DOCDirStream):
         print '<PICFAndOfficeArtData>'
         found = False
         for prl in self.parent.parent.parent.prls:
-            if prl.sprm.sprm == 0x0806: # sprmCFData
+            if prl.sprm.sprm in (0x0806, 0x080a): # sprmCFData, sprmCFOle2
                 found = True
+                break
         if not found:
             pos = self.pos
             picf = PICF(self)
@@ -1021,7 +1022,7 @@ class PICFAndOfficeArtData(DOCDirStream):
             remaining = picf.lcb - (self.pos - pos)
             msodraw.InlineSpContainer(self, remaining).dumpXml(self, getWordModel(self.parent.mainStream))
         else:
-            print '<todo what="PICFAndOfficeArtData::dump(): handle sprmCFData"/>'
+            print '<todo what="PICFAndOfficeArtData::dump(): handle sprmCFData or sprmCFOle2"/>'
         print '</PICFAndOfficeArtData>'
 
 # The TextFlow enumeration specifies the rotation settings for a block of text and for the individual