[Libreoffice-commits] mso-dumper.git: 6 commits - src/docdirstream.py src/docrecord.py src/docstream.py src/globals.py
Miklos Vajna
vmiklos at kemper.freedesktop.org
Sun Aug 18 08:13:47 PDT 2013
src/docdirstream.py | 9 ++++++---
src/docrecord.py | 40 ++++++++++++++++++++++++++++++++--------
src/docstream.py | 9 +++++++--
src/globals.py | 7 +++++--
4 files changed, 50 insertions(+), 15 deletions(-)
New commits:
commit 806678b9a755eb7f304a855c1efe23e86901cea4
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Sun Aug 18 16:07:54 2013 +0200
dump grfhic
diff --git a/src/docrecord.py b/src/docrecord.py
index a0d6cd4..61be2b4 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -3405,6 +3405,28 @@ class PlcfGram(DOCDirStream, PLC):
print '</aCP>'
print '</plcfGram>'
+class Grfhic(DOCDirStream):
+ """The grfhic structure is a set of HTML incompatibility flags that specify
+ the HTML incompatibilities of a list structure."""
+ def __init__(self, parent):
+ DOCDirStream.__init__(self, parent.bytes)
+ self.pos = parent.pos
+ self.parent = parent
+
+ def dump(self):
+ print '<grfhic type="grfhic">'
+ buf = self.readuInt8()
+ self.printAndSet("fhicChecked", self.getBit(buf, 0))
+ self.printAndSet("fhicFormat", self.getBit(buf, 1))
+ self.printAndSet("fhicListText", self.getBit(buf, 2))
+ self.printAndSet("fhicPeriod", self.getBit(buf, 3))
+ self.printAndSet("fhicLeft1", self.getBit(buf, 4))
+ self.printAndSet("fhicListTab", self.getBit(buf, 5))
+ self.printAndSet("unused", self.getBit(buf, 6))
+ self.printAndSet("fhicBullet", self.getBit(buf, 7))
+ self.parent.pos = self.pos
+ print '</grfhic>'
+
class LSTF(DOCDirStream):
"""The LSTF structure contains formatting properties that apply to an entire list."""
def __init__(self, plfLst, index):
@@ -3426,7 +3448,7 @@ class LSTF(DOCDirStream):
self.printAndSet("unused2", self.getBit(buf, 3))
self.printAndSet("fHybrid", self.getBit(buf, 4))
self.printAndSet("reserved1", (buf & 0xe0) >> 5) # 6..8th bits
- self.printAndSet("grfhic", self.readuInt8()) # TODO dump grfhic
+ Grfhic(self).dump()
print '</lstf>'
class LVLF(DOCDirStream):
@@ -3455,7 +3477,7 @@ class LVLF(DOCDirStream):
self.printAndSet("cbGrpprlChpx", self.readuInt8())
self.printAndSet("cbGrpprlPapx", self.readuInt8())
self.printAndSet("ilvlRestartLim", self.readuInt8())
- self.printAndSet("grfhic", self.readuInt8()) # TODO dump grfhic
+ Grfhic(self).dump()
print '</lvlf>'
class LVL(DOCDirStream):
@@ -3531,7 +3553,7 @@ class LFO(DOCDirStream):
self.printAndSet("unused2", self.readuInt32())
self.printAndSet("clfolvl", self.readuInt8())
self.printAndSet("ibstFltAutoNum", self.readuInt8())
- self.printAndSet("grfhic", self.readuInt8()) # TODO dump grfhic
+ Grfhic(self).dump()
self.printAndSet("unused3", self.readuInt8())
print '</lfo>'
commit 7ebcbd32d92043bcc998e6dca2067ea86d5d9934
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Sun Aug 18 15:58:25 2013 +0200
the spec says lcbPlcfLvcPre10 should be ignored as well
diff --git a/src/docstream.py b/src/docstream.py
index 22695d4..89cd0d7 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -520,7 +520,7 @@ class WordDocumentStream(DOCDirStream):
value = self.readInt32()
hasHandler = len(i) > 1
# the spec says these must be ignored
- needsIgnoring = ["lcbStshfOrig", "lcbPlcfBteLvc"]
+ needsIgnoring = ["lcbStshfOrig", "lcbPlcfBteLvc", "lcbPlcfLvcPre10"]
# a member needs handling if it defines the size of a struct and it's non-zero
needsHandling = i[0].startswith("lcb") and value != 0 and (not i[0] in needsIgnoring)
self.printAndSet(i[0], value, end = ((not hasHandler) and (not needsHandling)), offset = True)
commit dea1e63ac3e62ca67192d39177c1de26fb15fd77
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Sun Aug 18 15:38:58 2013 +0200
WordDocumentStream: don't throw on invalid utf16
Just print a warning instead. ooo101417-1.doc triggers this.
diff --git a/src/docdirstream.py b/src/docdirstream.py
index 9888638..5cf1493 100644
--- a/src/docdirstream.py
+++ b/src/docdirstream.py
@@ -134,7 +134,7 @@ class DOCDirStream:
else:
break
count += 1
- return globals.getUTF8FromUTF16("".join(map(lambda x: chr(x), bytes)))
+ return globals.getUTF8FromUTF16("".join(map(lambda x: chr(x), bytes)), xml = True)
def getBit(self, byte, bitNumber):
return (byte & (1 << bitNumber)) >> bitNumber
diff --git a/src/docstream.py b/src/docstream.py
index 398207c..22695d4 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -948,7 +948,12 @@ class WordDocumentStream(DOCDirStream):
if compressed:
return globals.encodeName(self.bytes[pos])
else:
- return globals.encodeName(self.bytes[pos:pos+2].decode('utf-16'), lowOnly = True)
+ try:
+ return globals.encodeName(self.bytes[pos:pos+2].decode('utf-16'), lowOnly = True)
+ except UnicodeDecodeError:
+ reason = 'could not decode bytes in position %d-%d (%s-%s)' % (pos, pos+1, hex(ord(self.bytes[pos])), hex(ord(self.bytes[pos+1])))
+ print '<todo what="WordDocumentStream::retrieveCP(): %s"/>' % reason
+ return globals.encodeName(self.bytes[pos:pos+2].decode('utf-16', errors="replace"), lowOnly = True)
def retrieveCPs(self, start, end):
"""Retrieves a range of characters."""
diff --git a/src/globals.py b/src/globals.py
index 68aae93..31e995a 100644
--- a/src/globals.py
+++ b/src/globals.py
@@ -412,7 +412,7 @@ def getDouble (bytes):
return struct.unpack('<d', text)[0]
-def getUTF8FromUTF16 (bytes):
+def getUTF8FromUTF16 (bytes, xml = False):
# little endian utf-16 strings
byteCount = len(bytes)
loopCount = int(byteCount/2)
@@ -431,7 +431,10 @@ def getUTF8FromUTF16 (bytes):
try:
text += unicode(code, 'utf-8')
except UnicodeDecodeError:
- text += "<%d invalid chars>"%len(code)
+ close = ""
+ if xml:
+ close="/"
+ text += "<%d invalid chars%s>"%(len(code), close)
return text
class StreamWrap(object):
commit 4031eb1e626a9c66c0c306708b6ea96bbfb468f6
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Sun Aug 18 15:02:58 2013 +0200
TCGRF: tolerate undocumented TextFlow
As seen in ooo100632-2.doc.
diff --git a/src/docdirstream.py b/src/docdirstream.py
index c3e4207..9888638 100644
--- a/src/docdirstream.py
+++ b/src/docdirstream.py
@@ -21,13 +21,16 @@ class DOCDirStream:
self.mainStream = mainStream
self.doc = doc
- def printAndSet(self, key, value, hexdump = True, end = True, offset = False, silent = False, dict = None):
+ def printAndSet(self, key, value, hexdump = True, end = True, offset = False, silent = False, dict = None, default = None):
setattr(self, key, value)
if silent:
return
attrs = ""
if dict:
- attrs += ' name="%s"' % dict[value]
+ if value in dict or not default:
+ attrs += ' name="%s"' % dict[value]
+ else:
+ attrs += ' name="%s"' % default
if hexdump:
value = hex(value)
if offset:
diff --git a/src/docrecord.py b/src/docrecord.py
index a9d99db..a0d6cd4 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -1072,7 +1072,7 @@ class TCGRF(DOCDirStream):
print '<tcgrf type="TCGRF" offset="%d">' % self.pos
buf = self.readuInt16()
self.printAndSet("horzMerge", buf & 0x0003) # 1..2nd bits
- self.printAndSet("textFlow", (buf & 0x001c) >> 2, dict = TextFlow) # 3..6th bits
+ self.printAndSet("textFlow", (buf & 0x001c) >> 2, dict = TextFlow, default = "todo") # 3..6th bits
self.printAndSet("vertMerge", (buf & 0x0060) >> 6, dict = VerticalMergeFlag) # 7..8th bits
self.printAndSet("vertAlign", (buf & 0x0180) >> 8, dict = VerticalAlign) # 9..10th bits
self.printAndSet("ftsWidth", (buf & 0x0e00) >> 10, dict = Fts) # 11..12th bits
commit e8fd0762c67fab15929288719015c9af5b57fb4c
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Sun Aug 18 13:01:28 2013 +0200
PICFAndOfficeArtData: don't throw on MM_SHAPEFILE
fdo54551-1.doc triggered this; with the change, it now just prints a
proper TODO instead of throwing.
diff --git a/src/docrecord.py b/src/docrecord.py
index 66ff07f..a9d99db 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -1019,8 +1019,9 @@ class PICFAndOfficeArtData(DOCDirStream):
assert self.pos == pos + 68
if picf.mfpf.mm == 0x0066:
print '<todo what="PICFAndOfficeArtData::dump(): picf.mfpf.mm == MM_SHAPEFILE is unhandled"/>'
- remaining = picf.lcb - (self.pos - pos)
- msodraw.InlineSpContainer(self, remaining).dumpXml(self, getWordModel(self.parent.mainStream))
+ else:
+ remaining = picf.lcb - (self.pos - pos)
+ msodraw.InlineSpContainer(self, remaining).dumpXml(self, getWordModel(self.parent.mainStream))
else:
print '<todo what="PICFAndOfficeArtData::dump(): handle sprmCFData or sprmCFOle2"/>'
print '</PICFAndOfficeArtData>'
commit 8f43c92520fb01066e7c287eaa2ba69b9ebe74fe
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Sun Aug 18 11:51:35 2013 +0200
PICFAndOfficeArtData: blacklist sprmCFOle2 as well
According to the spec, this should not occur with a 0x01 placeholder
char, but fdo48097-1.doc has it.
diff --git a/src/docrecord.py b/src/docrecord.py
index e424e65..66ff07f 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -1009,8 +1009,9 @@ class PICFAndOfficeArtData(DOCDirStream):
print '<PICFAndOfficeArtData>'
found = False
for prl in self.parent.parent.parent.prls:
- if prl.sprm.sprm == 0x0806: # sprmCFData
+ if prl.sprm.sprm in (0x0806, 0x080a): # sprmCFData, sprmCFOle2
found = True
+ break
if not found:
pos = self.pos
picf = PICF(self)
@@ -1021,7 +1022,7 @@ class PICFAndOfficeArtData(DOCDirStream):
remaining = picf.lcb - (self.pos - pos)
msodraw.InlineSpContainer(self, remaining).dumpXml(self, getWordModel(self.parent.mainStream))
else:
- print '<todo what="PICFAndOfficeArtData::dump(): handle sprmCFData"/>'
+ print '<todo what="PICFAndOfficeArtData::dump(): handle sprmCFData or sprmCFOle2"/>'
print '</PICFAndOfficeArtData>'
# The TextFlow enumeration specifies the rotation settings for a block of text and for the individual
More information about the Libreoffice-commits
mailing list