[Libreoffice-commits] mso-dumper.git: 3 commits - msodumper/globals.py msodumper/node.py msodumper/ole.py msodumper/xlsrecord.py msodumper/xlsstream.py

Thu Jan 9 10:43:18 PST 2014

 msodumper/globals.py   |   86 +++++++++++++++++++++++++++++++++++++++----------
 msodumper/node.py      |    9 +++--
 msodumper/ole.py       |    2 +
 msodumper/xlsrecord.py |   40 ++++++++++++++--------
 msodumper/xlsstream.py |   49 ++++++++++++++++++++++-----
 5 files changed, 144 insertions(+), 42 deletions(-)

New commits:
commit 595542f5865f13aee61b161f468c4ee8b25cf6ad
Author: Jean-Francois Dockes <jf at dockes.org>
Date:   Thu Jan 9 08:20:21 2014 +0100

    Accept shorter BOF record

diff --git a/msodumper/xlsrecord.py b/msodumper/xlsrecord.py
index 0c55b60..ac070f4 100644
--- a/msodumper/xlsrecord.py
+++ b/msodumper/xlsrecord.py
@@ -627,16 +627,27 @@ class BOF(BaseRecordHandler):
         self.buildYear = self.readUnsignedInt(2)
 
         # file history flags
-        self.flags = self.readUnsignedInt(4)
-        self.win     = (self.flags & 0x00000001)
-        self.risc    = (self.flags & 0x00000002)
-        self.beta    = (self.flags & 0x00000004)
-        self.winAny  = (self.flags & 0x00000008)
-        self.macAny  = (self.flags & 0x00000010)
-        self.betaAny = (self.flags & 0x00000020)
-        self.riscAny = (self.flags & 0x00000100)
-        self.lowestExcelVer = self.readSignedInt(4)
-
+        try:
+            self.flags = self.readUnsignedInt(4)
+            self.win     = (self.flags & 0x00000001)
+            self.risc    = (self.flags & 0x00000002)
+            self.beta    = (self.flags & 0x00000004)
+            self.winAny  = (self.flags & 0x00000008)
+            self.macAny  = (self.flags & 0x00000010)
+            self.betaAny = (self.flags & 0x00000020)
+            self.riscAny = (self.flags & 0x00000100)
+            self.lowestExcelVer = self.readSignedInt(4)
+        except:
+            self.flags = 0
+            self.win     = 0
+            self.risc    = 0
+            self.beta    = 0
+            self.winAny  = 0
+            self.macAny  = 0
+            self.betaAny = 0
+            self.riscAny = 0
+            self.lowestExcelVer = 0
+            
     def parseBytes (self):
         self.__parseBytes()
         # BIFF version
commit 2d4f7cf511275a7a6103bce1d75963354e7e14fa
Author: Jean-Francois Dockes <jf at dockes.org>
Date:   Wed Jan 8 17:20:29 2014 +0100

    Process CONTINUE records by appending them to their base record. Process
    SST CONTINUE records specially, by handling the string compression byte
    (grbit) at the beginning of each CONTINUE record.

diff --git a/msodumper/globals.py b/msodumper/globals.py
index ca817ec..276a317 100644
--- a/msodumper/globals.py
+++ b/msodumper/globals.py
@@ -164,25 +164,55 @@ def encodeName (name, lowOnly = False, lowLimit = 0x20):
 
     return newname
 
+# Uncompress "compressed" UTF-16. This compression strips high bytes
+# from a string when they are all 0. Just restore them.
+def uncompCompUnicode(bytes):
+    out = ""
+    for b in bytes:
+        out += b
+        out += '\0'
+    return out
 
 class UnicodeRichExtText(object):
     def __init__ (self):
-        self.baseText = ''
+        self.baseText = unicode()
         self.phoneticBytes = []
 
+# Linear search for index of first element in sorted list strictly
+# bigger than a given value. Might be converted to binary search, but our
+# lists (CONTINUE record offsets) are small. If the returned index is
+# the list size (last valid index+1), the input value is beyond the
+# max list value
+def find_first_bigger(ilist, value):
+    i = 0
+    while i < len(ilist) and value >= ilist[i]:
+        i +=1
+    return i
 
-def getUnicodeRichExtText (bytes):
+def getUnicodeRichExtText (bytes, offset = 0, rofflist = []):
+    if len(rofflist) == 0:
+        rofflist = [len(bytes)]
     ret = UnicodeRichExtText()
     # Avoid myriad of messages when in "catching" mode
     if params.catchExceptions and (bytes is None or len(bytes) == 0):
         return ret, 0
+
+    if len(rofflist) == 0 or rofflist[len(rofflist)-1] != len(bytes):
+        error("bad input to getUnicodeRichExtText: empty offset list or last offset != size. size %d list %s" % (len(bytes), str(rofflist)))
+        raise ByteStreamError()
+
     strm = ByteStream(bytes)
+    strm.setCurrentPos(offset)
+
     try:
         textLen = strm.readUnsignedInt(2)
         flags = strm.readUnsignedInt(1)
         #  0 0 0 0 0 0 0 0
         # |-------|D|C|B|A|
-        isDoubleByte = (flags & 0x01) > 0 # A
+        if (flags & 0x01) > 0: # A
+            bytesPerChar = 2
+        else:
+            bytesPerChar = 1
         ignored      = (flags & 0x02) > 0 # B
         hasPhonetic  = (flags & 0x04) > 0 # C
         isRichStr    = (flags & 0x08) > 0 # D
@@ -195,18 +225,42 @@ def getUnicodeRichExtText (bytes):
         if hasPhonetic:
             phoneticBytes = strm.readUnsignedInt(4)
 
-        if isDoubleByte:
-            # double-byte string (UTF-16)
-            ret.baseText = \
-                unicode(strm.readBytes(2*textLen), 'UTF-16LE', errors='replace')
-        else:
-            # "Compressed Unicode" string. UTF-16 without the zero
-            # octets. These have to be latin1
-            if params.utf8:
-                ret.baseText = strm.readBytes(textLen).decode('cp1252')
-            else:
-                # If utf8 is not set, we'll print hex bytes, keep data as is
-                ret.baseText = strm.readBytes(textLen)
+        # Reading the string proper. This is made a bit more
+        # complicated by the fact that the format can switch from
+        # compressed (latin data with high zeros stripped) to normal
+        # (UTF-16LE) whenever a string encounters a CONTINUE record
+        # boundary. The new format is indicated by a single byte at
+        # the start of the CONTINUE record payload.
+        while textLen > 0:
+            #print("Reading Unicode with bytesPerChar %d" % bytesPerChar)
+            bytesToRead = textLen * bytesPerChar
+
+            # Truncate to next record boundary
+            ibound = find_first_bigger(rofflist, strm.getCurrentPos())
+            if ibound == len(rofflist):
+                # Just try to read and let the stream raise an exception
+                strm.readBytes(bytesToRead)
+                return
+            
+            bytesToRead = min(bytesToRead, \
+                              rofflist[ibound]- strm.getCurrentPos())
+            newdata = strm.readBytes(bytesToRead)
+            if bytesPerChar == 1:
+                newdata = uncompCompUnicode(newdata)
+
+            ret.baseText +=  unicode(newdata, 'UTF-16LE', errors='replace')
+
+            textLen -= bytesToRead // bytesPerChar
+            
+            # If there is still data to read, we hit a record boundary. Read
+            # the grbit byte for detecting possible compression switch
+            if textLen > 0:
+                grbit = strm.readUnsignedInt(1)
+                if (grbit & 1) != 0:
+                    bytesPerChar = 2
+                else:
+                    bytesPerChar = 1
+                
         if isRichStr:
             for i in xrange(0, numElem):
                 posChar = strm.readUnsignedInt(2)
@@ -219,7 +273,7 @@ def getUnicodeRichExtText (bytes):
             raise
         error("getUnicodeRichExtText: %s\n" % e)
         return ret, len(bytes)
-    return ret, strm.getCurrentPos()
+    return ret, strm.getCurrentPos() - offset
 
 
 def getRichText (bytes, textLen=None):
diff --git a/msodumper/node.py b/msodumper/node.py
index abf67ca..bab92bd 100644
--- a/msodumper/node.py
+++ b/msodumper/node.py
@@ -128,15 +128,18 @@ def encodeString (sin, utf8 = False):
             sout1 = sin.encode('UTF-8')
         else:
             sout1 = sin
-        # Escape special characters as entities
+        # Escape special characters as entities. Can't keep zero bytes either
+        # (bad XML). They can only arrive here if there is a bug somewhere.
         for c in sout1:
-            if c in encodeTable:
+            if ord(c) == 0:
+                sout += '(nullbyte)'
+            elif c in encodeTable:
                 sout += '&' + encodeTable[c] + ';'
             else:
                 sout += c
     else:
         for c in sin:
-            if ord(c) >= 128:
+            if ord(c) >= 128 or ord(c) == 0:
                 # encode non-ascii ranges.
                 sout += "\\x%2.2x"%ord(c)
             elif encodeTable.has_key(c):
diff --git a/msodumper/xlsrecord.py b/msodumper/xlsrecord.py
index 5fa3fc6..0c55b60 100644
--- a/msodumper/xlsrecord.py
+++ b/msodumper/xlsrecord.py
@@ -249,11 +249,12 @@ class DXFN12NoCB(object):
 
 class BaseRecordHandler(globals.ByteStream):
 
-    def __init__ (self, header, size, bytes, strmData):
+    def __init__ (self, header, size, bytes, strmData, roflist = []):
         globals.ByteStream.__init__(self, bytes)
         self.header = header
         self.lines = []
         self.strmData = strmData
+        self.roflist = roflist
 
     def parseBytes (self):
         """Parse the original bytes and generate human readable output.
@@ -1599,7 +1600,7 @@ class SST(BaseRecordHandler):
         self.strCount = self.readSignedInt(4) # total number of unique strings.
         self.sharedStrings = []
         for i in xrange(0, self.strCount):
-            extText, bytesRead = globals.getUnicodeRichExtText(self.bytes[self.getCurrentPos():])
+            extText, bytesRead = globals.getUnicodeRichExtText(self.bytes, self.getCurrentPos(), self.roflist)
             self.readBytes(bytesRead) # advance current position.
             self.sharedStrings.append(extText)
 
@@ -1922,7 +1923,7 @@ class SupBook(BaseRecordHandler):
         self.moveBack(2)
         pos = self.getCurrentPos()
         while pos < self.size:
-            ret, bytesLen = globals.getUnicodeRichExtText(self.bytes[pos:])
+            ret, bytesLen = globals.getUnicodeRichExtText(self.bytes, pos)
             name = ret.baseText
             self.moveForward(bytesLen)
             self.names.append(name)
@@ -2167,7 +2168,7 @@ class Crn(BaseRecordHandler):
             elif typeId == 0x02:
                 # string
                 pos = self.getCurrentPos()
-                ret, length = globals.getUnicodeRichExtText(self.bytes[pos:])
+                ret, length = globals.getUnicodeRichExtText(self.bytes, pos)
                 text = ret.baseText
                 text = globals.encodeName(text)
                 self.moveForward(length)
diff --git a/msodumper/xlsstream.py b/msodumper/xlsstream.py
index debce12..945c5a8 100644
--- a/msodumper/xlsstream.py
+++ b/msodumper/xlsstream.py
@@ -449,12 +449,43 @@ class XLDirStream(object):
         bytes = self.readByteArray(size)
         return pos, header, size, bytes
 
-    def __getRecordHandler (self, header, size, bytes):
+    def __readRecAndContBytes(self):
+        '''Read record itself and possible CONTINUE blocks.'''
+
+        pos, header, size, bytes = self.__readRecordBytes()
+
+        # Records boundaries/offset list (only useful if there are
+        # CONTINUE records)
+        roflist = [size]
+        
+        # Read possible CONTINUE records, and concatenate the data
+        while self.peekNext() == 0x3c:
+            cpos, cheader, csize, cbytes = self.__readRecordBytes()
+            bytes += cbytes
+            size += csize
+            roflist.append(size)
+
+        return pos, header, size, bytes, roflist
+
+    def peekNext (self):
+        '''Check type of next record without changing stream state'''
+        
+        if self.size - self.pos < 4:
+            raise EndOfStream
+
+        pos = self.pos
+        header = self.readRaw(2)
+        if header == 0x0000:
+            raise EndOfStream
+        self.pos = pos
+        return header
+        
+    def __getRecordHandler (self, header, size, bytes, roflist):
         # record handler that parses the raw bytes and displays more
         # meaningful information.
         handler = None
         if recData.has_key(header) and len(recData[header]) >= 3:
-            handler = recData[header][2](header, size, bytes, self.strmData)
+            handler = recData[header][2](header, size, bytes, self.strmData, roflist)
 
         if handler != None and self.strmData.encrypted:
             # record handler exists.  Parse the record and display more info
@@ -470,8 +501,8 @@ class XLDirStream(object):
             self.strmData.encrypted = True
 
     def fillModel (self, model):
-        pos, header, size, bytes = self.__readRecordBytes()
-        handler = self.__getRecordHandler(header, size, bytes)
+        pos, header, size, bytes, roflist = self.__readRecAndContBytes()
+        handler = self.__getRecordHandler(header, size, bytes, roflist)
         if handler != None:
             try:
                 handler.fillModel(model)
@@ -483,11 +514,11 @@ class XLDirStream(object):
 
 
     def getNextRecordHandler (self):
-        pos, header, size, bytes = self.__readRecordBytes()
-        return self.__getRecordHandler(header, size, bytes)
+        pos, header, size, bytes, roflist = self.__readRecAndContBytes()
+        return self.__getRecordHandler(header, size, bytes, roflist)
 
     def readRecord (self):
-        pos, header, size, bytes = self.__readRecordBytes()
+        pos, header, size, bytes, roflist = self.__readRecAndContBytes()
 
         # record handler that parses the raw bytes and displays more
         # meaningful information.
@@ -500,12 +531,12 @@ class XLDirStream(object):
             print("%4.4Xh: %s - %s (%4.4Xh)"%
                   (header, recData[header][0], recData[header][1], header))
             if len(recData[header]) >= 3:
-                handler = recData[header][2](header, size, bytes, self.strmData)
+                handler = recData[header][2](header, size, bytes, self.strmData, roflist)
         elif self.type == DirType.RevisionLog and recDataRev.has_key(header):
             print("%4.4Xh: %s - %s (%4.4Xh)"%
                   (header, recDataRev[header][0], recDataRev[header][1], header))
             if len(recDataRev[header]) >= 3:
-                handler = recDataRev[header][2](header, size, bytes, self.strmData)
+                handler = recDataRev[header][2](header, size, bytes, self.strmData, roflist)
         else:
             print("%4.4Xh: [unknown record name] (%4.4Xh)"%(header, header))
 
commit 23d56056390cb8146ce36deff3c608e37453130a
Author: Jean-Francois Dockes <jf at dockes.org>
Date:   Wed Jan 8 15:49:39 2014 +0100

    Prevent possible infinite loop on bad input

diff --git a/msodumper/ole.py b/msodumper/ole.py
index dc284fb..16f3cb9 100644
--- a/msodumper/ole.py
+++ b/msodumper/ole.py
@@ -203,6 +203,8 @@ class Header(object):
             # additional sectors are used to store more SAT sector IDs.
             secID = self.__secIDFirstMSAT
             size = self.getSectorSize()
+            if size < 4:
+                raise Exception("ole.Header::parse: got %d as sector size!" % size)
             inLoop = True
             while inLoop:
                 pos = 512 + secID*size