[Libreoffice-commits] mso-dumper.git: 3 commits - msodumper/formula.py msodumper/globals.py msodumper/node.py msodumper/xlsmodel.py msodumper/xlsrecord.py msodumper/xlsstream.py xls-dump.py

Sun Jan 5 16:12:59 PST 2014

 msodumper/formula.py   |    9 +++-
 msodumper/globals.py   |  110 ++++++++++++++++++++++++++-----------------------
 msodumper/node.py      |   47 +++++++++++++-------
 msodumper/xlsmodel.py  |   17 ++++++-
 msodumper/xlsrecord.py |   37 +++++++++++-----
 msodumper/xlsstream.py |   21 +++++----
 xls-dump.py            |   14 ++++--
 7 files changed, 160 insertions(+), 95 deletions(-)

New commits:
commit 29ca3c8cc012317e9b2d579f7bda83191813d17b
Author: Jean-Francois Dockes <jf at dockes.org>
Date:   Fri Jan 3 14:28:57 2014 +0100

    Process strings as Python strings instead of arrays of integers. Add an option to print strings as UTF-8 instead of hex.

diff --git a/msodumper/globals.py b/msodumper/globals.py
index 2b45299..ca817ec 100644
--- a/msodumper/globals.py
+++ b/msodumper/globals.py
@@ -35,7 +35,9 @@ class Params(object):
         self.dumpText = False
         self.dumpedIds = []
         self.noRawDump = False
-
+        self.catchExceptions = False
+        self.utf8 = False
+        
 # Global parameters / run configuration, to be set up by the main
 # program during initialization
 params = Params()
@@ -195,14 +197,16 @@ def getUnicodeRichExtText (bytes):
 
         if isDoubleByte:
             # double-byte string (UTF-16)
-            text = ''
-            for i in xrange(0, textLen):
-                text += toTextBytes(strm.readBytes(2)).decode('utf-16')
-            ret.baseText = text
+            ret.baseText = \
+                unicode(strm.readBytes(2*textLen), 'UTF-16LE', errors='replace')
         else:
-            # single-byte string
-            ret.baseText = toTextBytes(strm.readBytes(textLen))
-
+            # "Compressed Unicode" string. UTF-16 without the zero
+            # octets. These have to be latin1
+            if params.utf8:
+                ret.baseText = strm.readBytes(textLen).decode('cp1252')
+            else:
+                # If utf8 is not set, we'll print hex bytes, keep data as is
+                ret.baseText = strm.readBytes(textLen)
         if isRichStr:
             for i in xrange(0, numElem):
                 posChar = strm.readUnsignedInt(2)
@@ -254,11 +258,14 @@ Note the following:
     totalByteLen = strm.getCurrentPos() + textLen + extraBytes
     if is16Bit:
         totalByteLen += textLen # double the text length since each char is 2 bytes.
-        text = ''
-        for i in xrange(0, textLen):
-            text += toTextBytes(strm.readBytes(2)).decode('utf-16')
+        text = unicode(strm.readBytes(2*textLen), 'UTF-16LE', errors='replace')
     else:
-        text = toTextBytes(strm.readBytes(textLen))
+        if params.utf8:
+            # Compressed Unicode-> latin1
+            text = strm.readBytes(textLen).decode('cp1252')
+        else:
+            # Old behaviour with hex dump
+            text = strm.readBytes(textLen)
 
     return (text, totalByteLen)
 
@@ -344,20 +351,15 @@ def getRawBytes (bytes, spaced=True, reverse=True):
     return text
 
 
+# TBD: getTextBytes is now only called from pptrecord.
+# getTextBytes() and toTextBytes() are probably not
+# needed any more now that we store text as str not list.
+# toTextBytes() has been changed to do nothing until we're sure we can dump it
 def getTextBytes (bytes):
     return toTextBytes(bytes)
 
-
 def toTextBytes (bytes):
-    n = len(bytes)
-    text = ''
-    for i in xrange(0, n):
-        b = bytes[i]
-        if type(b) == type(0x00):
-            b = struct.pack('B', b)
-        text += b
-    return text
-
+    return bytes
 
 def getSignedInt (bytes):
     # little endian
diff --git a/msodumper/node.py b/msodumper/node.py
index c376ac6..abf67ca 100644
--- a/msodumper/node.py
+++ b/msodumper/node.py
@@ -119,17 +119,31 @@ encodeTable = {
     '\'': 'apos'
 }
 
-def encodeString (sin):
+# If utf8 is set, the input is either utf-8 bytes or Python
+# Unicode. Output utf-8 instead of hex-dump.
+def encodeString (sin, utf8 = False):
     sout = ''
-    for c in sin:
-        if ord(c) >= 128:
-            # encode non-ascii ranges.
-            sout += "\\x%2.2x"%ord(c)
-        elif encodeTable.has_key(c):
-            # encode html symbols.
-            sout += '&' + encodeTable[c] + ';'
+    if utf8:
+        if isinstance(sin, unicode):
+            sout1 = sin.encode('UTF-8')
         else:
-            sout += c
+            sout1 = sin
+        # Escape special characters as entities
+        for c in sout1:
+            if c in encodeTable:
+                sout += '&' + encodeTable[c] + ';'
+            else:
+                sout += c
+    else:
+        for c in sin:
+            if ord(c) >= 128:
+                # encode non-ascii ranges.
+                sout += "\\x%2.2x"%ord(c)
+            elif encodeTable.has_key(c):
+                # encode html symbols.
+                sout += '&' + encodeTable[c] + ';'
+            else:
+                sout += c
 
     return sout
 
@@ -146,10 +160,11 @@ def convertAttrValue (val):
 
     return val
 
-def prettyPrint (fd, node):
-    printNode(fd, node, 0, True)
+# If utf8 is set, the input is either utf-8 bytes or unicode
+def prettyPrint (fd, node, utf8 = False):
+    printNode(fd, node, 0, True, utf8 = utf8)
 
-def printNode (fd, node, level, breakLine):
+def printNode (fd, node, level, breakLine, utf8 = False):
     singleIndent = ''
     lf = ''
     if breakLine:
@@ -159,7 +174,7 @@ def printNode (fd, node, level, breakLine):
     if node.nodeType == NodeType.Root:
         # root node itself only contains child nodes.
         for child in node.getChildNodes():
-            printNode(fd, child, level, True)
+            printNode(fd, child, level, True, utf8 = utf8)
     elif node.nodeType == NodeType.Element:
         hasChildren = len(node.getChildNodes()) > 0
 
@@ -174,7 +189,7 @@ def printNode (fd, node, level, breakLine):
                 if val == None:
                     continue
                 val = convertAttrValue(val)
-                line += " " + key + '="' + encodeString(val) + '"'
+                line += " " + key + '="' + encodeString(val, utf8 = utf8) + '"'
 
         if hasChildren:
             breakChildren = breakLine and not node.hasContent()
@@ -183,7 +198,7 @@ def printNode (fd, node, level, breakLine):
                 line += "\n"
             fd.write (indent + line)
             for child in node.getChildNodes():
-                printNode(fd, child, level+1, breakChildren)
+                printNode(fd, child, level+1, breakChildren, utf8 = utf8)
             line = "</%s>%s"%(node.name, lf)
             if breakChildren:
                 line = indent + line
@@ -194,7 +209,7 @@ def printNode (fd, node, level, breakLine):
 
     elif node.nodeType == NodeType.Content:
         content = node.content
-        content = encodeString(content)
+        content = encodeString(content, utf8 = utf8)
         if len(content) > 0:
             fd.write (indent + content + lf)
 
diff --git a/msodumper/xlsstream.py b/msodumper/xlsstream.py
index a87cdbd..debce12 100644
--- a/msodumper/xlsstream.py
+++ b/msodumper/xlsstream.py
@@ -428,13 +428,11 @@ class XLDirStream(object):
         return bytes
 
     def readByteArray (self, size=1):
-        bytes = []
-        for i in xrange(0, size):
-            if self.pos >= self.size:
-                raise EndOfStream
-            bytes.append(ord(self.bytes[self.pos]))
-            self.pos += 1
-        return bytes
+        if self.pos + size >= self.size:
+            raise EndOfStream
+        curpos = self.pos
+        self.pos += size
+        return self.bytes[curpos:self.pos]
 
     def __printSep (self, c, w, prefix=''):
         print(prefix + c*w)
@@ -520,7 +518,7 @@ class XLDirStream(object):
         for i in xrange(0, size):
             if (i+1) % 16 == 1:
                 output("%4.4Xh: "%header)
-            output("%2.2X "%bytes[i])
+            output("%2.2X "% ord(bytes[i]))
             if (i+1) % 16 == 0 and i != size-1:
                 print("")
         if size > 0:
diff --git a/xls-dump.py b/xls-dump.py
index edd70a1..2f76996 100755
--- a/xls-dump.py
+++ b/xls-dump.py
@@ -73,7 +73,7 @@ class XLDumper(object):
             dirstrm = self.strm.getDirectoryStream(d)
             data = self.__readSubStreamXML(dirstrm)
             self.__dumpDataAsXML(data, root)
-        node.prettyPrint(sys.stdout, docroot)
+        node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)
 
     def dumpCanonicalXML (self):
         self.__parseFile()
@@ -92,7 +92,7 @@ class XLDumper(object):
             wbmodel.encrypted = self.strmData.encrypted
             root.appendChild(wbmodel.createDOM())
 
-        node.prettyPrint(sys.stdout, docroot)
+        node.prettyPrint(sys.stdout, docroot, utf8 = self.params.utf8)
 
     def dump (self):
         self.__parseFile()
@@ -209,12 +209,15 @@ def main ():
         help="Specify the dump mode.  Possible values are: 'flat', 'xml', or 'canonical-xml'.  The default value is 'flat'.")
     parser.add_option("--catch", action="store_true", dest="catch_exceptions", default=False,
         help="Catch exceptions and try to continue.")
+    parser.add_option("--utf-8", action="store_true", dest="utf8", default=False,
+        help="Output strings as UTF-8.")
     options, args = parser.parse_args()
     params = globals.params
     params.debug = options.debug
     params.showSectorChain = options.show_sector_chain
     params.showStreamPos = options.show_stream_pos
     params.catchExceptions = options.catch_exceptions
+    params.utf8 = options.utf8
     
     if len(args) < 1:
         globals.error("takes at least one argument\n")
commit 63e06027085ddc1857ee21d2a111b6acf0110a90
Author: Jean-Francois Dockes <jf at dockes.org>
Date:   Fri Jan 3 14:15:00 2014 +0100

    Have the RK cell object populate the workbook model for later XML dumping.

diff --git a/msodumper/xlsrecord.py b/msodumper/xlsrecord.py
index 9fb2641..5fa3fc6 100644
--- a/msodumper/xlsrecord.py
+++ b/msodumper/xlsrecord.py
@@ -1484,23 +1484,31 @@ class Protect(BaseRecordHandler):
 class RK(BaseRecordHandler):
     """Cell with encoded integer or floating-point value"""
 
-    def parseBytes (self):
-        row = globals.getSignedInt(self.bytes[0:2])
-        col = globals.getSignedInt(self.bytes[2:4])
-        xf  = globals.getSignedInt(self.bytes[4:6])
+    def __parseBytes (self):
+        self.row = globals.getSignedInt(self.bytes[0:2])
+        self.col = globals.getSignedInt(self.bytes[2:4])
+        self.xf  = globals.getSignedInt(self.bytes[4:6])
 
         rkval = globals.getSignedInt(self.bytes[6:10])
-        auxData = RKAuxData()
-        realVal = decodeRK(rkval, auxData)
+        self.auxData = RKAuxData()
+        self.realVal = decodeRK(rkval, self.auxData)
 
-        self.appendCellPosition(col, row)
-        self.appendLine("XF record ID: %d"%xf)
-        self.appendLine("multiplied by 100: %d"%auxData.multi100)
-        if auxData.signedInt:
+    def parseBytes (self):
+        self.__parseBytes()
+        self.appendCellPosition(self.col, self.row)
+        self.appendLine("XF record ID: %d"%self.xf)
+        self.appendLine("multiplied by 100: %d"%self.auxData.multi100)
+        if self.auxData.signedInt:
             self.appendLine("type: signed integer")
         else:
             self.appendLine("type: floating point")
-        self.appendLine("value: %g"%realVal)
+        self.appendLine("value: %g"%self.realVal)
+
+    def fillModel (self, model):
+        self.__parseBytes()
+        sheet = model.getCurrentSheet()
+        cell = xlsmodel.NumberCell(self.realVal)
+        sheet.setCell(self.col, self.row, cell)
 
 class Scl(BaseRecordHandler):
 
commit bb7ab93048ea5b910a749824ca65dff9dc381e02
Author: Jean-Francois Dockes <jf at dockes.org>
Date:   Fri Dec 20 13:38:28 2013 +0100

    Add option to catch exceptions and try to continue. Make params a global variable.
    
    With the option not set, the output was verified to be identical to that
    of the previous version, in both XML and raw output modes, on more than
    200 random xls files harvested from the web.

diff --git a/msodumper/formula.py b/msodumper/formula.py
index 9c16b56..abb17b7 100644
--- a/msodumper/formula.py
+++ b/msodumper/formula.py
@@ -684,7 +684,14 @@ associated token classes will be without the leading underscore (_)."""
     def __init__ (self, header, bytes):
         self.header = header
         self.tokens = []
-        self.strm = globals.ByteStream(bytes)
+        try:
+            # We are sometimes called with None bytes
+            self.strm = globals.ByteStream(bytes)
+        except:
+            if not globals.params.catchExceptions:
+                raise
+            globals.error("FormulaParser: init called with None source\n")
+            self.strm = globals.ByteStream("")
 
     def parse (self):
         while not self.strm.isEndOfRecord():
diff --git a/msodumper/globals.py b/msodumper/globals.py
index c5d89d8..2b45299 100644
--- a/msodumper/globals.py
+++ b/msodumper/globals.py
@@ -171,42 +171,50 @@ class UnicodeRichExtText(object):
 
 def getUnicodeRichExtText (bytes):
     ret = UnicodeRichExtText()
+    # Avoid myriad of messages when in "catching" mode
+    if params.catchExceptions and (bytes is None or len(bytes) == 0):
+        return ret, 0
     strm = ByteStream(bytes)
-    textLen = strm.readUnsignedInt(2)
-    flags = strm.readUnsignedInt(1)
-    #  0 0 0 0 0 0 0 0
-    # |-------|D|C|B|A|
-    isDoubleByte = (flags & 0x01) > 0 # A
-    ignored      = (flags & 0x02) > 0 # B
-    hasPhonetic  = (flags & 0x04) > 0 # C
-    isRichStr    = (flags & 0x08) > 0 # D
-
-    numElem = 0
-    if isRichStr:
-        numElem = strm.readUnsignedInt(2)
-
-    phoneticBytes = 0
-    if hasPhonetic:
-        phoneticBytes = strm.readUnsignedInt(4)
-
-    if isDoubleByte:
-        # double-byte string (UTF-16)
-        text = ''
-        for i in xrange(0, textLen):
-            text += toTextBytes(strm.readBytes(2)).decode('utf-16')
-        ret.baseText = text
-    else:
-        # single-byte string
-        ret.baseText = toTextBytes(strm.readBytes(textLen))
-
-    if isRichStr:
-        for i in xrange(0, numElem):
-            posChar = strm.readUnsignedInt(2)
-            fontIdx = strm.readUnsignedInt(2)
-
-    if hasPhonetic:
-        ret.phoneticBytes = strm.readBytes(phoneticBytes)
-
+    try:
+        textLen = strm.readUnsignedInt(2)
+        flags = strm.readUnsignedInt(1)
+        #  0 0 0 0 0 0 0 0
+        # |-------|D|C|B|A|
+        isDoubleByte = (flags & 0x01) > 0 # A
+        ignored      = (flags & 0x02) > 0 # B
+        hasPhonetic  = (flags & 0x04) > 0 # C
+        isRichStr    = (flags & 0x08) > 0 # D
+
+        numElem = 0
+        if isRichStr:
+            numElem = strm.readUnsignedInt(2)
+
+        phoneticBytes = 0
+        if hasPhonetic:
+            phoneticBytes = strm.readUnsignedInt(4)
+
+        if isDoubleByte:
+            # double-byte string (UTF-16)
+            text = ''
+            for i in xrange(0, textLen):
+                text += toTextBytes(strm.readBytes(2)).decode('utf-16')
+            ret.baseText = text
+        else:
+            # single-byte string
+            ret.baseText = toTextBytes(strm.readBytes(textLen))
+
+        if isRichStr:
+            for i in xrange(0, numElem):
+                posChar = strm.readUnsignedInt(2)
+                fontIdx = strm.readUnsignedInt(2)
+
+        if hasPhonetic:
+            ret.phoneticBytes = strm.readBytes(phoneticBytes)
+    except Exception as e:
+        if not params.catchExceptions:
+            raise
+        error("getUnicodeRichExtText: %s\n" % e)
+        return ret, len(bytes)
     return ret, strm.getCurrentPos()
 
 
diff --git a/msodumper/xlsmodel.py b/msodumper/xlsmodel.py
index c2bb890..8332e32 100644
--- a/msodumper/xlsmodel.py
+++ b/msodumper/xlsmodel.py
@@ -381,7 +381,13 @@ class Worksheet(SheetBase):
             nd.setAttr('first-defined-cell', self.__firstDefinedCell.getName())
 
         if self.__firstFreeCell != None:
-            nd.setAttr('first-free-cell', self.__firstFreeCell.getName())
+            try:
+                nd.setAttr('first-free-cell', self.__firstFreeCell.getName())
+            except Exception as e:
+                if not globals.params.catchExceptions:
+                    raise
+                globals.error("createDOM: trying set firstFreeCell: %s\n" % e)
+                pass
 
         self.__appendAutoFilterNode(wb, nd) # autofilter (if exists)
         self.__appendHiddenRowsNode(wb, nd) # hidden rows
@@ -521,8 +527,13 @@ class FormulaCell(CellBase):
         nd = node.Element('formula-cell')
         if self.tokens != None:
             parser = formula.FormulaParser(None, self.tokens)
-            parser.parse()
-            nd.setAttr('formula', parser.getText())
+            try:
+                parser.parse()
+                nd.setAttr('formula', parser.getText())
+            except:
+                if not globals.params.catchExceptions:
+                    raise
+                pass
             s = globals.getRawBytes(self.tokens, True, False)
             nd.setAttr('token-bytes', s)
             if self.cachedResult != None:
diff --git a/msodumper/xlsrecord.py b/msodumper/xlsrecord.py
index fdacf85..9fb2641 100644
--- a/msodumper/xlsrecord.py
+++ b/msodumper/xlsrecord.py
@@ -285,7 +285,12 @@ Like parseBytes(), the derived classes must overwrite this method."""
         try:
             self.parseBytes()
             for line in self.lines:
-                print (headerStr + line)
+                try:
+                    print (headerStr + line)
+                except:
+                    if not globals.params.catchExceptions:
+                        raise
+                    print (headerStr + "(xlsrecord:unprintable)")
         except globals.ByteStreamError:
             print(headerStr + "Error interpreting the record!")
 
diff --git a/msodumper/xlsstream.py b/msodumper/xlsstream.py
index b3b396f..a87cdbd 100644
--- a/msodumper/xlsstream.py
+++ b/msodumper/xlsstream.py
@@ -475,7 +475,12 @@ class XLDirStream(object):
         pos, header, size, bytes = self.__readRecordBytes()
         handler = self.__getRecordHandler(header, size, bytes)
         if handler != None:
-            handler.fillModel(model)
+            try:
+                handler.fillModel(model)
+            except Exception as e:
+                if not globals.params.catchExceptions:
+                    raise
+                globals.error("XLDirStream:fillModel: %s\n" % e)
         self.__postReadRecord(header)
 
 
diff --git a/xls-dump.py b/xls-dump.py
index 9ca6755..edd70a1 100755
--- a/xls-dump.py
+++ b/xls-dump.py
@@ -207,12 +207,15 @@ def main ():
         help="Show the position of each record relative to the stream.")
     parser.add_option("--dump-mode", dest="dump_mode", default="flat", metavar="MODE",
         help="Specify the dump mode.  Possible values are: 'flat', 'xml', or 'canonical-xml'.  The default value is 'flat'.")
+    parser.add_option("--catch", action="store_true", dest="catch_exceptions", default=False,
+        help="Catch exceptions and try to continue.")
     options, args = parser.parse_args()
-    params = globals.Params()
+    params = globals.params
     params.debug = options.debug
     params.showSectorChain = options.show_sector_chain
     params.showStreamPos = options.show_stream_pos
-
+    params.catchExceptions = options.catch_exceptions
+    
     if len(args) < 1:
         globals.error("takes at least one argument\n")
         parser.print_help()