[Libreoffice-commits] mso-dumper.git: 2 commits - src/docrecord.py src/docstream.py
Miklos Vajna
vmiklos at kemper.freedesktop.org
Thu May 23 03:14:46 PDT 2013
src/docrecord.py | 54 +++++++++++++++++++++++++++---------------------------
src/docstream.py | 23 ++++++++++++++---------
2 files changed, 41 insertions(+), 36 deletions(-)
New commits:
commit 953c2e65c059c8542b977d7a42fccf26b0397e66
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Thu May 23 12:13:17 2013 +0200
WordDocumentStream::__cpToOffset: use binary search
before:
$ time ./doc-dump.py fdo39958-1.doc > out.xml
real 11m51.771s
user 11m49.455s
sys 0m0.099s
after:
$ time ./doc-dump.py fdo39958-1.doc > out.xml
real 0m42.294s
user 0m42.023s
sys 0m0.058s
diff --git a/src/docstream.py b/src/docstream.py
index d147b82..cc0766b 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -12,6 +12,7 @@ from docdirstream import DOCDirStream
import docrecord
import globals
import sys
+import bisect
class DOCFile:
"""Represents the whole word file - feed will all bytes."""
@@ -932,9 +933,7 @@ class WordDocumentStream(DOCDirStream):
def __cpToOffset(self, cp):
"""Implements 2.4.1 Retrieving Text."""
plcPcd = self.clx.pcdt.plcPcd
- for i in range(len(plcPcd.aCp)):
- if plcPcd.aCp[i] <= cp:
- index = i
+ index = bisect.bisect_right(plcPcd.aCp, cp) - 1
aPcd = plcPcd.aPcd[index]
fcCompressed = aPcd.fc
if fcCompressed.fCompressed == 1:
commit 3d77a214bf257e79f5fcbe09079c139d67100069
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Wed May 22 21:44:50 2013 +0200
doc: handle fWhichTblStm != 1
fdo37057-1.doc is a reproducer for this.
diff --git a/src/docrecord.py b/src/docrecord.py
index 570269d..a14f81e 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -99,7 +99,7 @@ class FBKF(DOCDirStream):
class PlcfBkf(DOCDirStream, PLC):
"""A PLCFBKF is a PLC whose data elements are FBKF structures."""
def __init__(self, mainStream, offset, size):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream)
PLC.__init__(self, size, 4) # 4 is defined by 2.8.10
self.pos = offset
self.size = size
@@ -152,7 +152,7 @@ class Fld(DOCDirStream):
class PlcFld(DOCDirStream, PLC):
"""The Plcfld structure specifies the location of fields in the document."""
def __init__(self, mainStream):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream)
PLC.__init__(self, mainStream.lcbPlcfFldMom, 2) # 2 is defined by 2.8.25
self.pos = mainStream.fcPlcfFldMom
self.size = mainStream.lcbPlcfFldMom
@@ -184,7 +184,7 @@ class PlcFld(DOCDirStream, PLC):
class PlcfBkl(DOCDirStream, PLC):
"""The Plcfbkl structure is a PLC that contains only CPs and no additional data."""
def __init__(self, mainStream, offset, size, start):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream)
PLC.__init__(self, size, 0) # 0 is defined by 2.8.12
self.pos = offset
self.size = size
@@ -273,7 +273,7 @@ class Sed(DOCDirStream):
class PlcfSed(DOCDirStream, PLC):
"""The PlcfSed structure is a PLC structure where the data elements are Sed structures."""
def __init__(self, mainStream, offset, size):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream)
PLC.__init__(self, size, Sed.size)
self.pos = offset
self.size = size
@@ -299,7 +299,7 @@ class PlcfSed(DOCDirStream, PLC):
class Tcg(DOCDirStream):
"""The Tcg structure specifies command-related customizations."""
def __init__(self, mainStream, offset, size):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes)
self.pos = offset
self.size = size
@@ -339,7 +339,7 @@ class Sty(DOCDirStream):
class Selsf(DOCDirStream):
"""The Selsf structure specifies the last selection that was made to the document."""
def __init__(self, mainStream):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes)
self.pos = mainStream.fcWss
self.size = mainStream.lcbWss
self.mainStream = mainStream
@@ -1011,7 +1011,7 @@ class PnFkpPapx(DOCDirStream):
class PlcBteChpx(DOCDirStream, PLC):
"""The PlcBteChpx structure is a PLC that maps the offsets of text in the WordDocument stream to the character properties of that text."""
def __init__(self, mainStream):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream=mainStream)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream=mainStream)
PLC.__init__(self, mainStream.lcbPlcfBteChpx, 4)
self.pos = mainStream.fcPlcfBteChpx
self.size = mainStream.lcbPlcfBteChpx
@@ -1036,7 +1036,7 @@ class PlcfHdd(DOCDirStream, PLC):
"""The Plcfhdd structure is a PLC that contains only CPs and no additional data. It specifies where
header document stories begin and end."""
def __init__(self, mainStream):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream=mainStream)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream=mainStream)
PLC.__init__(self, mainStream.lcbPlcfHdd, 0)
self.pos = mainStream.fcPlcfHdd
self.size = mainStream.lcbPlcfHdd
@@ -1081,7 +1081,7 @@ class PlcfHdd(DOCDirStream, PLC):
class PlcfandTxt(DOCDirStream, PLC):
"""The PlcfandTxt structure is a PLC that contains only CPs and no additional data."""
def __init__(self, mainStream, offset, size):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream=mainStream)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream=mainStream)
PLC.__init__(self, size, 0)
self.pos = offset
self.size = size
@@ -1102,7 +1102,7 @@ class PlcfandTxt(DOCDirStream, PLC):
class PlcfandRef(DOCDirStream, PLC):
"""The PlcfandRef structure is a PLC whose data elements are ATRDPre10 structures."""
def __init__(self, mainStream, offset, size):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream=mainStream)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream=mainStream)
PLC.__init__(self, size, 30)
self.pos = offset
self.size = size
@@ -1881,7 +1881,7 @@ class Dop2007(DOCDirStream):
class RC4EncryptionHeader(DOCDirStream):
"""The encryption header structure used for RC4 encryption."""
def __init__(self, fib, pos, size):
- DOCDirStream.__init__(self, fib.doc.getDirectoryStreamByName("1Table").bytes)
+ DOCDirStream.__init__(self, fib.getTableStream().bytes)
self.fib = fib
self.pos = pos
self.size = size
@@ -1900,7 +1900,7 @@ class RC4EncryptionHeader(DOCDirStream):
class Dop(DOCDirStream):
"""The Dop structure contains the document and compatibility settings for the document."""
def __init__(self, fib):
- DOCDirStream.__init__(self, fib.doc.getDirectoryStreamByName("1Table").bytes)
+ DOCDirStream.__init__(self, fib.getTableStream().bytes)
self.pos = fib.fcDop
self.size = fib.lcbDop
self.fib = fib
@@ -2006,7 +2006,7 @@ class SttbfFfn(DOCDirStream):
class GrpXstAtnOwners(DOCDirStream):
"""This array contains the names of authors of comments in the document."""
def __init__(self, mainStream):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes)
self.pos = mainStream.fcGrpXstAtnOwners
self.size = mainStream.lcbGrpXstAtnOwners
self.mainStream = mainStream
@@ -2023,7 +2023,7 @@ class GrpXstAtnOwners(DOCDirStream):
class SttbfAssoc(DOCDirStream):
"""The SttbfAssoc structure is an STTB that contains strings which are associated with this document."""
def __init__(self, mainStream):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes)
self.pos = mainStream.fcSttbfAssoc
self.size = mainStream.lcbSttbfAssoc
self.mainStream = mainStream
@@ -2069,7 +2069,7 @@ class SttbfAssoc(DOCDirStream):
class SttbfRMark(DOCDirStream):
"""The SttbfRMark structure is an STTB structure where the strings specify the names of the authors of the revision marks, comments, and e-mail messages in the document."""
def __init__(self, mainStream):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes)
self.pos = mainStream.fcSttbfRMark
self.size = mainStream.lcbSttbfRMark
self.mainStream = mainStream
@@ -2105,7 +2105,7 @@ class OfficeArtWordDrawing(DOCDirStream):
class OfficeArtContent(DOCDirStream):
"""The OfficeArtContent structure specifies information about a drawing in the document."""
def __init__(self, mainStream):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes)
self.pos = mainStream.fcDggInfo
self.size = mainStream.lcbDggInfo
self.mainStream = mainStream
@@ -2140,7 +2140,7 @@ class ATNBE(DOCDirStream):
class SttbfAtnBkmk(DOCDirStream):
"""The SttbfAtnBkmk structure is an STTB whose strings are all of zero length."""
def __init__(self, mainStream, offset, size):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes)
self.pos = offset
self.size = size
@@ -2650,7 +2650,7 @@ class SPLS(DOCDirStream):
class PlcfSpl(DOCDirStream, PLC):
"""The Plcfspl structure is a Plc structure whose data elements are SpellingSpls structures."""
def __init__(self, mainStream):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream)
PLC.__init__(self, mainStream.lcbPlcfSpl, 2) # 2 is defined by 2.8.28
self.pos = mainStream.fcPlcfSpl
self.size = mainStream.lcbPlcfSpl
@@ -2732,7 +2732,7 @@ class FTXBXS(DOCDirStream):
class PlcftxbxTxt(DOCDirStream, PLC):
"""Specifies which ranges of text are contained in which textboxes."""
def __init__(self, mainStream):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream)
PLC.__init__(self, mainStream.lcbPlcftxbxTxt, FTXBXS.size)
self.pos = mainStream.fcPlcftxbxTxt
self.size = mainStream.lcbPlcftxbxTxt
@@ -2781,7 +2781,7 @@ class Tbkd(DOCDirStream):
class PlcftxbxBkd(DOCDirStream, PLC):
"""Specifies which ranges of text go inside which textboxes."""
def __init__(self, mainStream):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream)
PLC.__init__(self, mainStream.lcbPlcfTxbxBkd, 6)
self.pos = mainStream.fcPlcfTxbxBkd
self.size = mainStream.lcbPlcfTxbxBkd
@@ -2807,7 +2807,7 @@ class PlcfSpa(DOCDirStream, PLC):
"""The PlcfSpa structure is a PLC structure in which the data elements are
SPA structures."""
def __init__(self, mainStream, pos, size):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream)
PLC.__init__(self, size, 26) # 2.8.37
self.pos = pos
self.size = size
@@ -2822,7 +2822,7 @@ class PlcfSpa(DOCDirStream, PLC):
class PlcfGram(DOCDirStream, PLC):
"""The PlcfGram structure is a Plc structure whose data elements are GrammarSpls structures."""
def __init__(self, mainStream):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream)
PLC.__init__(self, mainStream.lcbPlcfGram, 2) # 2 is defined by 2.8.21
self.pos = mainStream.fcPlcfGram
self.size = mainStream.lcbPlcfGram
@@ -2936,7 +2936,7 @@ class LVL(DOCDirStream):
class PlfLst(DOCDirStream):
"""The PlfLst structure contains the list formatting information for the document."""
def __init__(self, mainStream):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream)
self.pos = mainStream.fcPlfLst
self.size = mainStream.lcbPlfLst
@@ -2992,7 +2992,7 @@ class LFOData(DOCDirStream):
class PlfLfo(DOCDirStream):
"""The PlfLfo structure contains the list format override data for the document."""
def __init__(self, mainStream):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream)
self.pos = mainStream.fcPlfLfo
self.size = mainStream.lcbPlfLfo
@@ -3014,7 +3014,7 @@ class PlfLfo(DOCDirStream):
class SttbListNames(DOCDirStream):
"""The SttbListNames structure is an STTB structure whose strings are the names used by the LISTNUM field."""
def __init__(self, mainStream):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream=mainStream)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream=mainStream)
self.pos = mainStream.fcSttbListNames
self.size = mainStream.lcbSttbListNames
@@ -3035,7 +3035,7 @@ class SttbListNames(DOCDirStream):
class SttbSavedBy(DOCDirStream):
"""The SttbSavedBy structure is an STTB structure that specifies the save history of this document."""
def __init__(self, mainStream):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream=mainStream)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream=mainStream)
self.pos = mainStream.fcSttbSavedBy
self.size = mainStream.lcbSttbSavedBy
@@ -3056,7 +3056,7 @@ class SttbSavedBy(DOCDirStream):
class SttbfBkmk(DOCDirStream):
"""The SttbfBkmk structure is an STTB structure whose strings specify the names of bookmarks."""
def __init__(self, mainStream):
- DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes)
+ DOCDirStream.__init__(self, mainStream.getTableStream().bytes)
self.pos = mainStream.fcSttbfBkmk
self.size = mainStream.lcbSttbfBkmk
self.mainStream = mainStream
diff --git a/src/docstream.py b/src/docstream.py
index 37db294..d147b82 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -50,7 +50,7 @@ class DOCFile:
def getStreamFromBytes(self, name, bytes):
if name == "WordDocument":
return WordDocumentStream(bytes, self.params, doc=self)
- if name == "1Table":
+ if name in ("0Table", "1Table"):
return TableStream(bytes, self.params, name, doc=self)
else:
return DOCDirStream(bytes, self.params, name, doc=self)
@@ -181,6 +181,12 @@ class WordDocumentStream(DOCDirStream):
self.printAndSet("lidThemeCS", self.readuInt16())
print '</%s>' % name
+ def getTableStream(self):
+ if self.fWhichTblStm:
+ return self.doc.getDirectoryStreamByName("1Table")
+ else:
+ return self.doc.getDirectoryStreamByName("0Table")
+
def dumpFibBase(self, name):
ret = True
print '<%s type="FibBase" size="32 bytes">' % name
@@ -217,7 +223,7 @@ class WordDocumentStream(DOCDirStream):
if self.fEncrypted == 1 and self.fObfuscated == 0:
self.printAndSet("lKey", self.readuInt32(), end = False)
print '<EncryptionVersionInfo>'
- tableStream = self.doc.getDirectoryStreamByName("1Table")
+ tableStream = self.getTableStream()
self.printAndSet("vMajor", tableStream.readuInt16())
self.printAndSet("vMinor", tableStream.readuInt16())
print '</EncryptionVersionInfo>'
@@ -531,7 +537,7 @@ class WordDocumentStream(DOCDirStream):
def handleLcbClx(self, silent = False):
offset = self.fcClx
size = self.lcbClx
- self.clx = docrecord.Clx(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size)
+ self.clx = docrecord.Clx(self.getTableStream().bytes, self, offset, size)
if not silent:
self.clx.dump()
@@ -546,19 +552,19 @@ class WordDocumentStream(DOCDirStream):
def handleLcbPlcfBtePapx(self):
offset = self.fcPlcfBtePapx
size = self.lcbPlcfBtePapx
- plcBtePapx = docrecord.PlcBtePapx(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size)
+ plcBtePapx = docrecord.PlcBtePapx(self.getTableStream().bytes, self, offset, size)
plcBtePapx.dump()
def handleLcbSttbfFfn(self):
offset = self.fcSttbfFfn
size = self.lcbSttbfFfn
- sttbfFfn = docrecord.SttbfFfn(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size)
+ sttbfFfn = docrecord.SttbfFfn(self.getTableStream().bytes, self, offset, size)
sttbfFfn.dump()
def handleLcbStshf(self):
offset = self.fcStshf
size = self.lcbStshf
- stsh = docrecord.STSH(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size)
+ stsh = docrecord.STSH(self.getTableStream().bytes, self, offset, size)
stsh.dump()
def handleLcbPlcfandTxt(self):
More information about the Libreoffice-commits mailing list