[Libreoffice-commits] .: 3 commits - src/docrecord.py src/docstream.py test/doc
Miklos Vajna
vmiklos at kemper.freedesktop.org
Sat May 4 07:06:58 PDT 2013
src/docrecord.py | 7 +++----
src/docstream.py | 26 ++++++++++++++++++++++++++
test/doc/header.doc |binary
test/doc/header.rtf | 4 ++++
test/doc/headerlo.doc |binary
test/doc/test.py | 12 ++++++++++++
6 files changed, 45 insertions(+), 4 deletions(-)
New commits:
commit 0ccbaa56a706ec398e4cea1a656ced09829f1f8f
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Sat May 4 16:02:55 2013 +0200
PlcfSed: use retrieveCPs
diff --git a/src/docrecord.py b/src/docrecord.py
index bcd156b..20c63b4 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -300,7 +300,6 @@ class PlcfSed(DOCDirStream, PLC):
def dump(self):
print '<plcfSed type="PlcfSed" offset="%d" size="%d bytes">' % (self.pos, self.size)
- offset = self.mainStream.fcMin
pos = self.pos
for i in range(self.getElements()):
# aCp
@@ -313,7 +312,7 @@ class PlcfSed(DOCDirStream, PLC):
aSed = Sed(self, self.getOffset(self.pos, i))
aSed.dump()
- print '<transformed value="%s"/>' % self.quoteAttr(self.mainStream.retrieveText(offset + start, offset + end, logicalLength = True))
+ print '<transformed value="%s"/>' % self.quoteAttr(self.mainStream.retrieveCPs(start, end))
print '</aCP>'
print '</plcfSed>'
commit d8165133205a3c3533211b8e4f0da14de450bdaa
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Sat May 4 15:57:43 2013 +0200
doc: test header dumping of both MSO and LO-generated doc
diff --git a/test/doc/header.doc b/test/doc/header.doc
new file mode 100755
index 0000000..a351d72
Binary files /dev/null and b/test/doc/header.doc differ
diff --git a/test/doc/header.rtf b/test/doc/header.rtf
new file mode 100644
index 0000000..b3fd8b4
--- /dev/null
+++ b/test/doc/header.rtf
@@ -0,0 +1,4 @@
+{\rtf1
+{\header This is a header.}
+Hello world!\par
+}
diff --git a/test/doc/headerlo.doc b/test/doc/headerlo.doc
new file mode 100644
index 0000000..73ec689
Binary files /dev/null and b/test/doc/headerlo.doc differ
diff --git a/test/doc/test.py b/test/doc/test.py
index d02f680..e50e447 100755
--- a/test/doc/test.py
+++ b/test/doc/test.py
@@ -170,6 +170,18 @@ class Test(unittest.TestCase):
levels = self.root.findall('stream[@name="WordDocument"]/fib/fibRgFcLcbBlob/lcbPlfLst/plfLst/lvl')
self.assertEqual("•", levels[0].findall('xst/rgtchar')[0].attrib['value'])
+ def test_header(self):
+ self.dump('header')
+
+ firstHeader = self.root.findall('stream[@name="WordDocument"]/fib/fibRgFcLcbBlob/lcbPlcfHdd/plcfHdd/aCP[@index="7"]')
+ self.assertEqual("This is a header.\\x0D\\x0D", firstHeader[0].findall('transformed')[0].attrib['value'])
+
+ def test_headerlo(self):
+ self.dump('headerlo')
+
+ firstHeader = self.root.findall('stream[@name="WordDocument"]/fib/fibRgFcLcbBlob/lcbPlcfHdd/plcfHdd/aCP[@index="7"]')
+ self.assertEqual("This is a header.\\x0D\\x0D", firstHeader[0].findall('transformed')[0].attrib['value'])
+
if __name__ == '__main__':
unittest.main()
commit 89cb847a68c1db6a1177d9a0673a68702de48da2
Author: Miklos Vajna <vmiklos at suse.cz>
Date: Sat May 4 15:50:25 2013 +0200
doc: fix dumping header/footer text in LO-produced files
diff --git a/src/docrecord.py b/src/docrecord.py
index 62ca85f..bcd156b 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -816,7 +816,7 @@ class PlcfHdd(DOCDirStream, PLC):
def dump(self):
print '<plcfHdd type="PlcfHdd" offset="%d" size="%d bytes">' % (self.pos, self.size)
- offset = self.mainStream.fcMin + self.mainStream.ccpText
+ offset = self.mainStream.ccpText + self.mainStream.ccpFtn
pos = self.pos
for i in range(self.getElements() - 1):
start = self.getuInt32(pos = pos)
@@ -837,7 +837,7 @@ class PlcfHdd(DOCDirStream, PLC):
11: "First page footer",
}
print '<aCP index="%d" contents="%s" start="%d" end="%d">' % (i, contentsMap[i], start, end)
- print '<transformed value="%s"/>' % self.quoteAttr(self.mainStream.retrieveText(offset + start, offset + end))
+ print '<transformed value="%s"/>' % self.quoteAttr(self.mainStream.retrieveCPs(offset + start, offset + end))
pos += 4
print '</aCP>'
print '</plcfHdd>'
diff --git a/src/docstream.py b/src/docstream.py
index 7f3cc51..dc36baf 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -9,6 +9,7 @@ import ole
import struct
from docdirstream import DOCDirStream
import docrecord
+import globals
class DOCFile:
"""Represents the whole word file - feed will all bytes."""
@@ -674,8 +675,33 @@ class WordDocumentStream(DOCDirStream):
return index
def retrieveText(self, start, end, logicalLength = False):
+ """Deprecated, use retrieveCPs instead."""
plcPcd = self.clx.pcdt.plcPcd
idx = self.__findText(plcPcd, start)
return plcPcd.aPcd[idx].fc.getTransformedValue(start, end, logicalPositions = False, logicalLength = logicalLength)
+ def retrieveCP(self, cp):
+ """Implements 2.4.1 Retrieving Text."""
+ plcPcd = self.clx.pcdt.plcPcd
+ for i in range(len(plcPcd.aCp)):
+ if plcPcd.aCp[i] <= cp:
+ index = i
+ break
+ aPcd = plcPcd.aPcd[index]
+ fcCompressed = aPcd.fc
+ if fcCompressed.fCompressed == 1:
+ return globals.encodeName(self.bytes[(fcCompressed.fc/2) + (cp - plcPcd.aCp[i])])
+ else:
+ pos = fcCompressed.fc + 2 * (cp - plcPcd.aCp[i])
+ return globals.encodeName(self.bytes[pos:pos+2].decode('utf-16'), lowOnly = True)
+
+ def retrieveCPs(self, start, end):
+ """Retrieves a range of characters."""
+ ret = []
+ i = start
+ while i < end:
+ ret.append(self.retrieveCP(i))
+ i += 1
+ return "".join(ret)
+
# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
More information about the Libreoffice-commits mailing list