[Libreoffice-commits] mso-dumper.git: 3 commits - msodumper/globals.py msodumper/node.py msodumper/ole.py msodumper/xlsrecord.py msodumper/xlsstream.py
Jean-Francois Dockes
jf at dockes.org
Thu Jan 9 10:43:18 PST 2014
msodumper/globals.py | 86 +++++++++++++++++++++++++++++++++++++++----------
msodumper/node.py | 9 +++--
msodumper/ole.py | 2 +
msodumper/xlsrecord.py | 40 ++++++++++++++--------
msodumper/xlsstream.py | 49 ++++++++++++++++++++++-----
5 files changed, 144 insertions(+), 42 deletions(-)
New commits:
commit 595542f5865f13aee61b161f468c4ee8b25cf6ad
Author: Jean-Francois Dockes <jf at dockes.org>
Date: Thu Jan 9 08:20:21 2014 +0100
Accept shorter BOF record
diff --git a/msodumper/xlsrecord.py b/msodumper/xlsrecord.py
index 0c55b60..ac070f4 100644
--- a/msodumper/xlsrecord.py
+++ b/msodumper/xlsrecord.py
@@ -627,16 +627,27 @@ class BOF(BaseRecordHandler):
self.buildYear = self.readUnsignedInt(2)
# file history flags
- self.flags = self.readUnsignedInt(4)
- self.win = (self.flags & 0x00000001)
- self.risc = (self.flags & 0x00000002)
- self.beta = (self.flags & 0x00000004)
- self.winAny = (self.flags & 0x00000008)
- self.macAny = (self.flags & 0x00000010)
- self.betaAny = (self.flags & 0x00000020)
- self.riscAny = (self.flags & 0x00000100)
- self.lowestExcelVer = self.readSignedInt(4)
-
+ try:
+ self.flags = self.readUnsignedInt(4)
+ self.win = (self.flags & 0x00000001)
+ self.risc = (self.flags & 0x00000002)
+ self.beta = (self.flags & 0x00000004)
+ self.winAny = (self.flags & 0x00000008)
+ self.macAny = (self.flags & 0x00000010)
+ self.betaAny = (self.flags & 0x00000020)
+ self.riscAny = (self.flags & 0x00000100)
+ self.lowestExcelVer = self.readSignedInt(4)
+ except:
+ self.flags = 0
+ self.win = 0
+ self.risc = 0
+ self.beta = 0
+ self.winAny = 0
+ self.macAny = 0
+ self.betaAny = 0
+ self.riscAny = 0
+ self.lowestExcelVer = 0
+
def parseBytes (self):
self.__parseBytes()
# BIFF version
commit 2d4f7cf511275a7a6103bce1d75963354e7e14fa
Author: Jean-Francois Dockes <jf at dockes.org>
Date: Wed Jan 8 17:20:29 2014 +0100
Process CONTINUE records by appending them to their base. Specially process
SST CONTINUE records by handling the string compression byte (grbit) at the
beginning of each CONTINUE record.
diff --git a/msodumper/globals.py b/msodumper/globals.py
index ca817ec..276a317 100644
--- a/msodumper/globals.py
+++ b/msodumper/globals.py
@@ -164,25 +164,55 @@ def encodeName (name, lowOnly = False, lowLimit = 0x20):
return newname
+# Uncompress "compressed" UTF-16. This compression strips high bytes
+# from a string when they are all 0. Just restore them.
+def uncompCompUnicode(bytes):
+ out = ""
+ for b in bytes:
+ out += b
+ out += '\0'
+ return out
class UnicodeRichExtText(object):
def __init__ (self):
- self.baseText = ''
+ self.baseText = unicode()
self.phoneticBytes = []
+# Linear search for index of first element in sorted list strictly
+# bigger than a given value. Might be converted to binary search, but our
+# lists (CONTINUE record offsets) are small. If the returned index is
+# the list size (last valid index+1), the input value is beyond the
+# max list value
+def find_first_bigger(ilist, value):
+ i = 0
+ while i < len(ilist) and value >= ilist[i]:
+ i +=1
+ return i
-def getUnicodeRichExtText (bytes):
+def getUnicodeRichExtText (bytes, offset = 0, rofflist = []):
+ if len(rofflist) == 0:
+ rofflist = [len(bytes)]
ret = UnicodeRichExtText()
# Avoid myriad of messages when in "catching" mode
if params.catchExceptions and (bytes is None or len(bytes) == 0):
return ret, 0
+
+ if len(rofflist) == 0 or rofflist[len(rofflist)-1] != len(bytes):
+ error("bad input to getUnicodeRichExtText: empty offset list or last offset != size. size %d list %s" % (len(bytes), str(rofflist)))
+ raise ByteStreamError()
+
strm = ByteStream(bytes)
+ strm.setCurrentPos(offset)
+
try:
textLen = strm.readUnsignedInt(2)
flags = strm.readUnsignedInt(1)
# 0 0 0 0 0 0 0 0
# |-------|D|C|B|A|
- isDoubleByte = (flags & 0x01) > 0 # A
+ if (flags & 0x01) > 0: # A
+ bytesPerChar = 2
+ else:
+ bytesPerChar = 1
ignored = (flags & 0x02) > 0 # B
hasPhonetic = (flags & 0x04) > 0 # C
isRichStr = (flags & 0x08) > 0 # D
@@ -195,18 +225,42 @@ def getUnicodeRichExtText (bytes):
if hasPhonetic:
phoneticBytes = strm.readUnsignedInt(4)
- if isDoubleByte:
- # double-byte string (UTF-16)
- ret.baseText = \
- unicode(strm.readBytes(2*textLen), 'UTF-16LE', errors='replace')
- else:
- # "Compressed Unicode" string. UTF-16 without the zero
- # octets. These have to be latin1
- if params.utf8:
- ret.baseText = strm.readBytes(textLen).decode('cp1252')
- else:
- # If utf8 is not set, we'll print hex bytes, keep data as is
- ret.baseText = strm.readBytes(textLen)
+ # Reading the string proper. This is made a bit more
+ # complicated by the fact that the format can switch from
+ # compressed (latin data with high zeros stripped) to normal
+ # (UTF-16LE) whenever a string encounters a CONTINUE record
+ # boundary. The new format is indicated by a single byte at
+ # the start of the CONTINUE record payload.
+ while textLen > 0:
+ #print("Reading Unicode with bytesPerChar %d" % bytesPerChar)
+ bytesToRead = textLen * bytesPerChar
+
+ # Truncate to next record boundary
+ ibound = find_first_bigger(rofflist, strm.getCurrentPos())
+ if ibound == len(rofflist):
+ # Just try to read and let the stream raise an exception
+ strm.readBytes(bytesToRead)
+ return
+
+ bytesToRead = min(bytesToRead, \
+ rofflist[ibound]- strm.getCurrentPos())
+ newdata = strm.readBytes(bytesToRead)
+ if bytesPerChar == 1:
+ newdata = uncompCompUnicode(newdata)
+
+ ret.baseText += unicode(newdata, 'UTF-16LE', errors='replace')
+
+ textLen -= bytesToRead // bytesPerChar
+
+ # If there is still data to read, we hit a record boundary. Read
+ # the grbit byte for detecting possible compression switch
+ if textLen > 0:
+ grbit = strm.readUnsignedInt(1)
+ if (grbit & 1) != 0:
+ bytesPerChar = 2
+ else:
+ bytesPerChar = 1
+
if isRichStr:
for i in xrange(0, numElem):
posChar = strm.readUnsignedInt(2)
@@ -219,7 +273,7 @@ def getUnicodeRichExtText (bytes):
raise
error("getUnicodeRichExtText: %s\n" % e)
return ret, len(bytes)
- return ret, strm.getCurrentPos()
+ return ret, strm.getCurrentPos() - offset
def getRichText (bytes, textLen=None):
diff --git a/msodumper/node.py b/msodumper/node.py
index abf67ca..bab92bd 100644
--- a/msodumper/node.py
+++ b/msodumper/node.py
@@ -128,15 +128,18 @@ def encodeString (sin, utf8 = False):
sout1 = sin.encode('UTF-8')
else:
sout1 = sin
- # Escape special characters as entities
+ # Escape special characters as entities. Can't keep zero bytes either
+ # (bad XML). They can only arrive here if there is a bug somewhere.
for c in sout1:
- if c in encodeTable:
+ if ord(c) == 0:
+ sout += '(nullbyte)'
+ elif c in encodeTable:
sout += '&' + encodeTable[c] + ';'
else:
sout += c
else:
for c in sin:
- if ord(c) >= 128:
+ if ord(c) >= 128 or ord(c) == 0:
# encode non-ascii ranges.
sout += "\\x%2.2x"%ord(c)
elif encodeTable.has_key(c):
diff --git a/msodumper/xlsrecord.py b/msodumper/xlsrecord.py
index 5fa3fc6..0c55b60 100644
--- a/msodumper/xlsrecord.py
+++ b/msodumper/xlsrecord.py
@@ -249,11 +249,12 @@ class DXFN12NoCB(object):
class BaseRecordHandler(globals.ByteStream):
- def __init__ (self, header, size, bytes, strmData):
+ def __init__ (self, header, size, bytes, strmData, roflist = []):
globals.ByteStream.__init__(self, bytes)
self.header = header
self.lines = []
self.strmData = strmData
+ self.roflist = roflist
def parseBytes (self):
"""Parse the original bytes and generate human readable output.
@@ -1599,7 +1600,7 @@ class SST(BaseRecordHandler):
self.strCount = self.readSignedInt(4) # total number of unique strings.
self.sharedStrings = []
for i in xrange(0, self.strCount):
- extText, bytesRead = globals.getUnicodeRichExtText(self.bytes[self.getCurrentPos():])
+ extText, bytesRead = globals.getUnicodeRichExtText(self.bytes, self.getCurrentPos(), self.roflist)
self.readBytes(bytesRead) # advance current position.
self.sharedStrings.append(extText)
@@ -1922,7 +1923,7 @@ class SupBook(BaseRecordHandler):
self.moveBack(2)
pos = self.getCurrentPos()
while pos < self.size:
- ret, bytesLen = globals.getUnicodeRichExtText(self.bytes[pos:])
+ ret, bytesLen = globals.getUnicodeRichExtText(self.bytes, pos)
name = ret.baseText
self.moveForward(bytesLen)
self.names.append(name)
@@ -2167,7 +2168,7 @@ class Crn(BaseRecordHandler):
elif typeId == 0x02:
# string
pos = self.getCurrentPos()
- ret, length = globals.getUnicodeRichExtText(self.bytes[pos:])
+ ret, length = globals.getUnicodeRichExtText(self.bytes, pos)
text = ret.baseText
text = globals.encodeName(text)
self.moveForward(length)
diff --git a/msodumper/xlsstream.py b/msodumper/xlsstream.py
index debce12..945c5a8 100644
--- a/msodumper/xlsstream.py
+++ b/msodumper/xlsstream.py
@@ -449,12 +449,43 @@ class XLDirStream(object):
bytes = self.readByteArray(size)
return pos, header, size, bytes
- def __getRecordHandler (self, header, size, bytes):
+ def __readRecAndContBytes(self):
+ '''Read record itself and possible CONTINUE blocks.'''
+
+ pos, header, size, bytes = self.__readRecordBytes()
+
+ # Records boundaries/offset list (only useful if there are
+ # CONTINUE records)
+ roflist = [size]
+
+ # Read possible CONTINUE records, and concatenate the data
+ while self.peekNext() == 0x3c:
+ cpos, cheader, csize, cbytes = self.__readRecordBytes()
+ bytes += cbytes
+ size += csize
+ roflist.append(size)
+
+ return pos, header, size, bytes, roflist
+
+ def peekNext (self):
+ '''Check type of next record without changing stream state'''
+
+ if self.size - self.pos < 4:
+ raise EndOfStream
+
+ pos = self.pos
+ header = self.readRaw(2)
+ if header == 0x0000:
+ raise EndOfStream
+ self.pos = pos
+ return header
+
+ def __getRecordHandler (self, header, size, bytes, roflist):
# record handler that parses the raw bytes and displays more
# meaningful information.
handler = None
if recData.has_key(header) and len(recData[header]) >= 3:
- handler = recData[header][2](header, size, bytes, self.strmData)
+ handler = recData[header][2](header, size, bytes, self.strmData, roflist)
if handler != None and self.strmData.encrypted:
# record handler exists. Parse the record and display more info
@@ -470,8 +501,8 @@ class XLDirStream(object):
self.strmData.encrypted = True
def fillModel (self, model):
- pos, header, size, bytes = self.__readRecordBytes()
- handler = self.__getRecordHandler(header, size, bytes)
+ pos, header, size, bytes, roflist = self.__readRecAndContBytes()
+ handler = self.__getRecordHandler(header, size, bytes, roflist)
if handler != None:
try:
handler.fillModel(model)
@@ -483,11 +514,11 @@ class XLDirStream(object):
def getNextRecordHandler (self):
- pos, header, size, bytes = self.__readRecordBytes()
- return self.__getRecordHandler(header, size, bytes)
+ pos, header, size, bytes, roflist = self.__readRecAndContBytes()
+ return self.__getRecordHandler(header, size, bytes, roflist)
def readRecord (self):
- pos, header, size, bytes = self.__readRecordBytes()
+ pos, header, size, bytes, roflist = self.__readRecAndContBytes()
# record handler that parses the raw bytes and displays more
# meaningful information.
@@ -500,12 +531,12 @@ class XLDirStream(object):
print("%4.4Xh: %s - %s (%4.4Xh)"%
(header, recData[header][0], recData[header][1], header))
if len(recData[header]) >= 3:
- handler = recData[header][2](header, size, bytes, self.strmData)
+ handler = recData[header][2](header, size, bytes, self.strmData, roflist)
elif self.type == DirType.RevisionLog and recDataRev.has_key(header):
print("%4.4Xh: %s - %s (%4.4Xh)"%
(header, recDataRev[header][0], recDataRev[header][1], header))
if len(recDataRev[header]) >= 3:
- handler = recDataRev[header][2](header, size, bytes, self.strmData)
+ handler = recDataRev[header][2](header, size, bytes, self.strmData, roflist)
else:
print("%4.4Xh: [unknown record name] (%4.4Xh)"%(header, header))
commit 23d56056390cb8146ce36deff3c608e37453130a
Author: Jean-Francois Dockes <jf at dockes.org>
Date: Wed Jan 8 15:49:39 2014 +0100
Prevent possible infinite loop on bad input
diff --git a/msodumper/ole.py b/msodumper/ole.py
index dc284fb..16f3cb9 100644
--- a/msodumper/ole.py
+++ b/msodumper/ole.py
@@ -203,6 +203,8 @@ class Header(object):
# additional sectors are used to store more SAT sector IDs.
secID = self.__secIDFirstMSAT
size = self.getSectorSize()
+ if size < 4:
+ raise Exception("ole.Header::parse: got %d as sector size!" % size)
inLoop = True
while inLoop:
pos = 512 + secID*size
More information about the Libreoffice-commits mailing list