[Libreoffice-commits] .: 9 commits - src/docrecord.py src/docstream.py
Miklos Vajna
vmiklos at kemper.freedesktop.org
Sat Jan 5 13:46:02 PST 2013
src/docrecord.py | 116 ++++++++++++++++++++++++++++++++++++-------------------
src/docstream.py | 35 +++++++++++++---
2 files changed, 106 insertions(+), 45 deletions(-)
New commits:
commit 123b9721a4b19f469051696b969542b961261392
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Sat Jan 5 22:45:47 2013 +0100
ChpxFkp: properly transform non-compressed strings
diff --git a/src/docrecord.py b/src/docrecord.py
index 418d255..52dcfd4 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -30,16 +30,25 @@ class FcCompressed(DOCDirStream):
self.printAndSet("r1", self.r1)
print '</fcCompressed>'
- def getTransformedValue(self, start, end, double = True):
+ def getTransformedValue(self, start, end, full = True):
if self.fCompressed:
offset = self.fc/2
- return globals.encodeName(self.mainStream.bytes[offset:offset+end-start])
+ if full:
+ fro = offset
+ to = offset+end-start
+ else:
+ fro = start
+ to = end
+ return globals.encodeName(self.mainStream.bytes[fro:to])
else:
- l = end - start
- if double:
- l = l * 2
- offset = self.fc
- return globals.encodeName(self.mainStream.bytes[offset:offset+l].decode('utf-16'), lowOnly = True)
+ if full:
+ offset = self.fc
+ fro = offset
+ to = offset + (end - start) * 2
+ else:
+ fro = start
+ to = end
+ return globals.encodeName(self.mainStream.bytes[fro:to].decode('utf-16'), lowOnly = True)
@staticmethod
def getFCTransformedValue(bytes, start, end):
@@ -503,7 +512,7 @@ class ChpxFkp(DOCDirStream):
start = self.getuInt32(pos = pos)
end = self.getuInt32(pos = pos + 4)
print '<rgfc index="%d" start="%d" end="%d">' % (i, start, end)
- print '<transformed value="%s"/>' % FcCompressed.getFCTransformedValue(self.bytes, start, end)
+ print '<transformed value="%s"/>' % self.pnFkpChpx.mainStream.retrieveText(start, end)
pos += 4
# rgbx
diff --git a/src/docstream.py b/src/docstream.py
index f56e27e..5197c40 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -424,9 +424,9 @@ class WordDocumentStream(DOCDirStream):
def handleLcbClx(self, silent = False):
offset = self.fcClx
size = self.lcbClx
- clx = docrecord.Clx(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size)
+ self.clx = docrecord.Clx(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size)
if not silent:
- clx.dump()
+ self.clx.dump()
def handleLcbPlcfBteChpx(self):
plcBteChpx = docrecord.PlcBteChpx(self)
@@ -624,6 +624,6 @@ class WordDocumentStream(DOCDirStream):
def retrieveText(self, start, end):
plcPcd = self.clx.pcdt.plcPcd
idx = self.__findText(plcPcd, start)
- return plcPcd.aPcd[idx].fc.getTransformedValue(start, end, double = False)
+ return plcPcd.aPcd[idx].fc.getTransformedValue(start, end, full = False)
# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
commit 09f36ce62d691c288795cb35026a3c92cd8df429
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Sat Jan 5 22:28:44 2013 +0100
PlcPcd: separate parse and dump
diff --git a/src/docrecord.py b/src/docrecord.py
index 25650d6..418d255 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -168,24 +168,28 @@ class PlcPcd(DOCDirStream, PLC):
self.size = size
self.aCp = []
self.aPcd = []
+ self.ranges = []
- def dump(self):
- print '<plcPcd type="PlcPcd" offset="%d" size="%d bytes">' % (self.pos, self.size)
pos = self.pos
for i in range(self.getElements()):
# aCp
start = self.getuInt32(pos = pos)
end = self.getuInt32(pos = pos + 4)
- print '<aCP index="%d" start="%d" end="%d">' % (i, start, end)
+ self.ranges.append((start, end))
self.aCp.append(start)
pos += 4
# aPcd
aPcd = Pcd(self.bytes, self.mainStream, self.getOffset(self.pos, i), 8)
- aPcd.dump()
self.aPcd.append(aPcd)
- print '<transformed value="%s"/>' % aPcd.fc.getTransformedValue(start, end)
+ def dump(self):
+ print '<plcPcd type="PlcPcd" offset="%d" size="%d bytes">' % (self.pos, self.size)
+ for i in range(self.getElements()):
+ start, end = self.ranges[i]
+ print '<aCP index="%d" start="%d" end="%d">' % (i, start, end)
+ self.aPcd[i].dump()
+ print '<transformed value="%s"/>' % self.aPcd[i].fc.getTransformedValue(start, end)
print '</aCP>'
print '</plcPcd>'
@@ -703,11 +707,15 @@ class Pcdt(DOCDirStream):
self.pos = offset
self.size = size
+ self.clxt = self.readuInt8()
+ self.lcb = self.readuInt32()
+ self.plcPcd = PlcPcd(self.bytes, self.mainStream, self.pos, self.lcb)
+
def dump(self):
print '<pcdt type="Pcdt" offset="%d" size="%d bytes">' % (self.pos, self.size)
- self.printAndSet("clxt", self.readuInt8())
- self.printAndSet("lcb", self.readuInt32())
- PlcPcd(self.bytes, self.mainStream, self.pos, self.lcb).dump()
+ self.printAndSet("clxt", self.clxt)
+ self.printAndSet("lcb", self.lcb)
+ self.plcPcd.dump()
print '</pcdt>'
class Clx(DOCDirStream):
commit 0a6ef67326a23e6c5c13f945130e118d8ffaf3f2
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Sat Jan 5 22:20:59 2013 +0100
PlcPcd: store aCp and aPcd array
diff --git a/src/docrecord.py b/src/docrecord.py
index ef32730..25650d6 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -166,6 +166,8 @@ class PlcPcd(DOCDirStream, PLC):
PLC.__init__(self, size, 8) # 8 is defined by 2.8.35
self.pos = offset
self.size = size
+ self.aCp = []
+ self.aPcd = []
def dump(self):
print '<plcPcd type="PlcPcd" offset="%d" size="%d bytes">' % (self.pos, self.size)
@@ -175,11 +177,13 @@ class PlcPcd(DOCDirStream, PLC):
start = self.getuInt32(pos = pos)
end = self.getuInt32(pos = pos + 4)
print '<aCP index="%d" start="%d" end="%d">' % (i, start, end)
+ self.aCp.append(start)
pos += 4
# aPcd
aPcd = Pcd(self.bytes, self.mainStream, self.getOffset(self.pos, i), 8)
aPcd.dump()
+ self.aPcd.append(aPcd)
print '<transformed value="%s"/>' % aPcd.fc.getTransformedValue(start, end)
print '</aCP>'
commit d35016594f20bf86f4b39b2dcdbcc6820a397ef7
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Sat Jan 5 22:16:42 2013 +0100
WordDocumentStream: add a retrieveText method
diff --git a/src/docrecord.py b/src/docrecord.py
index 770b956..ef32730 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -30,13 +30,16 @@ class FcCompressed(DOCDirStream):
self.printAndSet("r1", self.r1)
print '</fcCompressed>'
- def getTransformedValue(self, start, end):
- if self.fCompressed:
- offset = self.fc/2
- return globals.encodeName(self.mainStream.bytes[offset:offset+end-start])
- else:
- offset = self.fc
- return globals.encodeName(self.mainStream.bytes[offset:offset+end*2-start].decode('utf-16'), lowOnly = True)
+ def getTransformedValue(self, start, end, double = True):
+ if self.fCompressed:
+ offset = self.fc/2
+ return globals.encodeName(self.mainStream.bytes[offset:offset+end-start])
+ else:
+ l = end - start
+ if double:
+ l = l * 2
+ offset = self.fc
+ return globals.encodeName(self.mainStream.bytes[offset:offset+l].decode('utf-16'), lowOnly = True)
@staticmethod
def getFCTransformedValue(bytes, start, end):
diff --git a/src/docstream.py b/src/docstream.py
index f33fcf4..f56e27e 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -614,4 +614,16 @@ class WordDocumentStream(DOCDirStream):
self.__dumpFibRgFcLcb2002()
print '</%s>' % name
+ def __findText(self, plcPcd, cp):
+ """Find the largest i such that plcPcd.aCp[i] <= cp."""
+ for i in range(len(plcPcd.aCp)):
+ if plcPcd.aCp[i] <= cp:
+ index = i
+ return index
+
+ def retrieveText(self, start, end):
+ plcPcd = self.clx.pcdt.plcPcd
+ idx = self.__findText(plcPcd, start)
+ return plcPcd.aPcd[idx].fc.getTransformedValue(start, end, double = False)
+
# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
commit e86350962d0e874edf292e6b86fa35cf06aacd97
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Sat Jan 5 22:10:37 2013 +0100
parse Clx early
diff --git a/src/docstream.py b/src/docstream.py
index 7671bf8..f33fcf4 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -390,6 +390,18 @@ class WordDocumentStream(DOCDirStream):
["fcSttbfUssr"],
["lcbSttbfUssr"],
]
+
+ # Parse Clx early, as it's needed by other structures.
+ posOrig = self.pos
+ for i in fields:
+ value = self.readInt32()
+ if i[0] == "fcClx":
+ self.printAndSet(i[0], value, silent = True)
+ if i[0] == "lcbClx":
+ self.printAndSet(i[0], value, silent = True)
+ i[1](silent = True)
+ self.pos = posOrig
+
for i in fields:
value = self.readInt32()
hasHandler = len(i) > 1
@@ -409,11 +421,12 @@ class WordDocumentStream(DOCDirStream):
def handleDop(self):
docrecord.Dop(self).dump()
- def handleLcbClx(self):
+ def handleLcbClx(self, silent = False):
offset = self.fcClx
size = self.lcbClx
clx = docrecord.Clx(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size)
- clx.dump()
+ if not silent:
+ clx.dump()
def handleLcbPlcfBteChpx(self):
plcBteChpx = docrecord.PlcBteChpx(self)
commit f8902aa146f25d1f4aec76d59d207ab68740cf6b
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Sat Jan 5 22:07:20 2013 +0100
Clx: decouple parsing from dumping
diff --git a/src/docrecord.py b/src/docrecord.py
index 85a967f..770b956 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -709,12 +709,15 @@ class Clx(DOCDirStream):
self.pos = offset
self.size = size
+ self.firstByte = self.getuInt8()
+ if self.firstByte == 0x02:
+ self.pcdt = Pcdt(self.bytes, self.mainStream, self.pos, self.size)
+
def dump(self):
print '<clx type="Clx" offset="%d" size="%d bytes">' % (self.pos, self.size)
- firstByte = self.getuInt8()
- if firstByte == 0x02:
+ if self.firstByte == 0x02:
print '<info what="Array of Prc, 0 elements"/>'
- Pcdt(self.bytes, self.mainStream, self.pos, self.size).dump()
+ self.pcdt.dump()
else:
print '<todo what="Clx::dump() first byte is not 0x02"/>'
print '</clx>'
commit 3a42fcbf8610c2575252e33adfe4132863192970
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Sat Jan 5 22:05:18 2013 +0100
Pcd: decouple parsing from dumping
diff --git a/src/docrecord.py b/src/docrecord.py
index 4efc546..85a967f 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -49,16 +49,21 @@ class Pcd(DOCDirStream):
self.pos = offset
self.size = size
- def dump(self):
- print '<pcd type="Pcd" offset="%d" size="%d bytes">' % (self.pos, self.size)
buf = self.readuInt16()
- self.printAndSet("fNoParaLast", self.getBit(buf, 0))
- self.printAndSet("fR1", self.getBit(buf, 1))
- self.printAndSet("fDirty", self.getBit(buf, 2))
- self.printAndSet("fR2", buf & (2**13-1))
+ self.fNoParaLast = self.getBit(buf, 0)
+ self.fR1 = self.getBit(buf, 1)
+ self.fDirty = self.getBit(buf, 2)
+ self.fR2 = buf & (2**13-1)
self.fc = FcCompressed(self.bytes, self.mainStream, self.pos, 4)
- self.fc.dump()
self.pos += 4
+
+ def dump(self):
+ print '<pcd type="Pcd" offset="%d" size="%d bytes">' % (self.pos, self.size)
+ self.printAndSet("fNoParaLast", self.fNoParaLast)
+ self.printAndSet("fR1", self.fR1)
+ self.printAndSet("fDirty", self.fDirty)
+ self.printAndSet("fR2", self.fR2)
+ self.fc.dump()
print '</pcd>'
class PLC:
commit 9e5fe43c776fadee01da3f1c7174c16dd71cb0fb
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Sat Jan 5 21:56:53 2013 +0100
FcCompressed: decouple parsing from dumping
diff --git a/src/docrecord.py b/src/docrecord.py
index 82005b2..4efc546 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -18,12 +18,16 @@ class FcCompressed(DOCDirStream):
self.pos = offset
self.size = size
+ buf = self.readuInt32()
+ self.fc = buf & ((2**32-1) >> 2) # bits 0..29
+ self.fCompressed = self.getBit(buf, 30)
+ self.r1 = self.getBit(buf, 31)
+
def dump(self):
print '<fcCompressed type="FcCompressed" offset="%d" size="%d bytes">' % (self.pos, self.size)
- buf = self.readuInt32()
- self.printAndSet("fc", buf & ((2**32-1) >> 2)) # bits 0..29
- self.printAndSet("fCompressed", self.getBit(buf, 30))
- self.printAndSet("r1", self.getBit(buf, 31))
+ self.printAndSet("fc", self.fc)
+ self.printAndSet("fCompressed", self.fCompressed)
+ self.printAndSet("r1", self.r1)
print '</fcCompressed>'
def getTransformedValue(self, start, end):
commit aab6a67b4032cee6401fa206c84b8bb98bdc8f98
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Sat Jan 5 20:56:27 2013 +0100
pass reference to parent in handleLcbPlcfBteChpx, PlcBteChpx and PnFkpChpx
diff --git a/src/docrecord.py b/src/docrecord.py
index fc041e5..82005b2 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -468,10 +468,11 @@ class BxPap(DOCDirStream):
class ChpxFkp(DOCDirStream):
"""The ChpxFkp structure maps text to its character properties."""
- def __init__(self, bytes, mainStream, offset, size):
- DOCDirStream.__init__(self, mainStream.bytes)
+ def __init__(self, pnFkpChpx, offset, size):
+ DOCDirStream.__init__(self, pnFkpChpx.mainStream.bytes)
self.pos = offset
self.size = size
+ self.pnFkpChpx = pnFkpChpx
def dump(self):
print '<chpxFkp type="ChpxFkp" offset="%d" size="%d bytes">' % (self.pos, self.size)
@@ -525,17 +526,18 @@ class PapxFkp(DOCDirStream):
class PnFkpChpx(DOCDirStream):
"""The PnFkpChpx structure specifies the location in the WordDocument Stream of a ChpxFkp structure."""
- def __init__(self, bytes, mainStream, offset, size, name):
- DOCDirStream.__init__(self, bytes, mainStream=mainStream)
+ def __init__(self, plcBteChpx, offset, size, name):
+ DOCDirStream.__init__(self, plcBteChpx.bytes, mainStream=plcBteChpx.mainStream)
self.pos = offset
self.size = size
self.name = name
+ self.plcBteChpx = plcBteChpx
def dump(self):
print '<%s type="PnFkpChpx" offset="%d" size="%d bytes">' % (self.name, self.pos, self.size)
buf = self.readuInt32()
self.printAndSet("pn", buf & (2**22-1))
- chpxFkp = ChpxFkp(self.bytes, self.mainStream, self.pn*512, 512)
+ chpxFkp = ChpxFkp(self, self.pn*512, 512)
chpxFkp.dump()
print '</%s>' % self.name
@@ -587,11 +589,11 @@ class PnFkpPapx(DOCDirStream):
class PlcBteChpx(DOCDirStream, PLC):
"""The PlcBteChpx structure is a PLC that maps the offsets of text in the WordDocument stream to the character properties of that text."""
- def __init__(self, bytes, mainStream, offset, size):
- DOCDirStream.__init__(self, bytes, mainStream=mainStream)
- PLC.__init__(self, size, 4)
- self.pos = offset
- self.size = size
+ def __init__(self, mainStream):
+ DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream=mainStream)
+ PLC.__init__(self, mainStream.lcbPlcfBteChpx, 4)
+ self.pos = mainStream.fcPlcfBteChpx
+ self.size = mainStream.lcbPlcfBteChpx
def dump(self):
print '<plcBteChpx type="PlcBteChpx" offset="%d" size="%d bytes">' % (self.pos, self.size)
@@ -604,7 +606,7 @@ class PlcBteChpx(DOCDirStream, PLC):
pos += 4
# aPnBteChpx
- aPnBteChpx = PnFkpChpx(self.bytes, self.mainStream, self.getOffset(self.pos, i), 4, "aPnBteChpx")
+ aPnBteChpx = PnFkpChpx(self, self.getOffset(self.pos, i), 4, "aPnBteChpx")
aPnBteChpx.dump()
print '</aFC>'
print '</plcBteChpx>'
diff --git a/src/docstream.py b/src/docstream.py
index 5d4f999..7671bf8 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -416,9 +416,7 @@ class WordDocumentStream(DOCDirStream):
clx.dump()
def handleLcbPlcfBteChpx(self):
- offset = self.fcPlcfBteChpx
- size = self.lcbPlcfBteChpx
- plcBteChpx = docrecord.PlcBteChpx(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size)
+ plcBteChpx = docrecord.PlcBteChpx(self)
plcBteChpx.dump()
def handleLcbPlcfBtePapx(self):
More information about the Libreoffice-commits mailing list