[Libreoffice-commits] .: src/docrecord.py src/docstream.py

Miklos Vajna vmiklos at kemper.freedesktop.org
Sat May 4 11:20:47 PDT 2013


 src/docrecord.py |   25 ++++---------------------
 src/docstream.py |   23 ++++++++++++++++++-----
 2 files changed, 22 insertions(+), 26 deletions(-)

New commits:
commit 2c4e52467c405d35823a781e7745a1fe7ae42a30
Author: Miklos Vajna <vmiklos at suse.cz>
Date:   Sat May 4 20:09:32 2013 +0200

    retrieveText -> retrieveOffset
    
    So finally we have retrieveOffset to look up text based on raw byte
    offsets and retrieveCP to look up text based on logical character
    positions. retrieveText tried to do both, without success.

diff --git a/src/docrecord.py b/src/docrecord.py
index b46d0b1..9fda05f 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -30,23 +30,6 @@ class FcCompressed(DOCDirStream):
         self.printAndSet("r1", self.r1)
         print '</fcCompressed>'
 
-    def getTransformedValue(self, start, end, logicalPositions = True, logicalLength = True):
-        offset = self.fc
-        if self.fCompressed:
-            offset = self.fc/2
-        if logicalPositions:
-            fro = offset + start
-            to = offset + end
-        else:
-            fro = start
-            to = end
-        if self.fCompressed:
-            return globals.encodeName(self.mainStream.bytes[fro:to])
-        else:
-            if logicalLength:
-                to += (to - fro)
-            return globals.encodeName(self.mainStream.bytes[fro:to].decode('utf-16'), lowOnly = True)
-
 class Pcd(DOCDirStream):
     """The Pcd structure specifies the location of text in the WordDocument Stream and additional properties for this text."""
     def __init__(self, bytes, mainStream, offset, size):
@@ -217,7 +200,7 @@ class PlcfBkl(DOCDirStream, PLC):
             end = offset + self.getuInt32(pos = pos)
             print '<aCP index="%d" bookmarkEnd="%d">' % (i, end)
             start = self.start.aCP[i]
-            print '<transformed value="%s"/>' % self.quoteAttr(self.mainStream.retrieveText(start, end))
+            print '<transformed value="%s"/>' % self.quoteAttr(self.mainStream.retrieveOffset(start, end))
             pos += 4
             print '</aCP>'
         print '</plcfBkl>'
@@ -252,7 +235,7 @@ class PlcPcd(DOCDirStream, PLC):
             start, end = self.ranges[i]
             print '<aCP index="%d" start="%d" end="%d">' % (i, start, end)
             self.aPcd[i].dump()
-            print '<transformed value="%s"/>' % self.quoteAttr(self.aPcd[i].fc.getTransformedValue(start, end))
+            print '<transformed value="%s"/>' % self.quoteAttr(self.mainStream.retrieveCPs(start, end))
             print '</aCP>'
         print '</plcPcd>'
 
@@ -675,7 +658,7 @@ class ChpxFkp(DOCDirStream):
             start = self.getuInt32(pos = pos)
             end = self.getuInt32(pos = pos + 4)
             print '<rgfc index="%d" start="%d" end="%d">' % (i, start, end)
-            print '<transformed value="%s"/>' % self.quoteAttr(self.pnFkpChpx.mainStream.retrieveText(start, end))
+            print '<transformed value="%s"/>' % self.quoteAttr(self.pnFkpChpx.mainStream.retrieveOffset(start, end))
             pos += 4
 
             # rgbx
@@ -704,7 +687,7 @@ class PapxFkp(DOCDirStream):
             start = self.getuInt32(pos = pos)
             end = self.getuInt32(pos = pos + 4)
             print '<rgfc index="%d" start="%d" end="%d">' % (i, start, end)
-            print '<transformed value="%s"/>' % self.quoteAttr(self.mainStream.retrieveText(start, end))
+            print '<transformed value="%s"/>' % self.quoteAttr(self.mainStream.retrieveOffset(start, end))
             pos += 4
 
             # rgbx
diff --git a/src/docstream.py b/src/docstream.py
index dec8cb6..d98e0ea 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -674,11 +674,25 @@ class WordDocumentStream(DOCDirStream):
                 index = i
         return index
 
-    def retrieveText(self, start, end):
-        """Deprecated, use retrieveCPs instead."""
+    def retrieveOffset(self, start, end):
+        """Retrieves text, defined by raw byte offsets."""
+
+        # Is the given offset compressed?
         plcPcd = self.clx.pcdt.plcPcd
-        idx = self.__findText(plcPcd, start)
-        return plcPcd.aPcd[idx].fc.getTransformedValue(start, end, logicalPositions = False, logicalLength = False)
+        for i in range(len(plcPcd.aCp)):
+            aPcd = plcPcd.aPcd[i]
+            fcCompressed = aPcd.fc
+            if fcCompressed.fCompressed == 1:
+                offset = fcCompressed.fc/2
+            else:
+                offset = fcCompressed.fc
+            if offset <= start:
+                compressed = fcCompressed.fCompressed
+
+        if compressed:
+            return globals.encodeName(self.bytes[start:end])
+        else:
+            return globals.encodeName(self.bytes[start:end].decode('utf-16'), lowOnly = True)
 
     def retrieveCP(self, cp):
         """Implements 2.4.1 Retrieving Text."""
@@ -686,7 +700,6 @@ class WordDocumentStream(DOCDirStream):
         for i in range(len(plcPcd.aCp)):
             if plcPcd.aCp[i] <= cp:
                 index = i
-                break
         aPcd = plcPcd.aPcd[index]
         fcCompressed = aPcd.fc
         if fcCompressed.fCompressed == 1:


More information about the Libreoffice-commits mailing list