[Libreoffice-commits] mso-dumper.git: src/docstream.py

Miklos Vajna vmiklos at kemper.freedesktop.org
Wed May 22 02:35:51 PDT 2013


 src/docstream.py |   69 +++++++++++++++++++++++++++----------------------------
 1 file changed, 34 insertions(+), 35 deletions(-)

New commits:
commit 5e13b5a103883762c9047ce2f5c49417be4690f6
Author: Miklos Vajna <vmiklos at suse.cz>
Date:   Wed May 22 10:10:58 2013 +0200

    doc: don't try to show bytes as text outside Clx
    
    When e.g. ChpxFkp refers to byte ranges defined by offsets, we try to be
    helpful and print the matching text, to see which characters a given
    character property is applied to. However, it's possible that these offsets
    refer to picture data -- so only show these bytes as text in case they
    are referred to as text in Clx. fdo34222-4.doc is a reproducer for this
    problem.

diff --git a/src/docstream.py b/src/docstream.py
index ee196ce..37db294 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -895,44 +895,35 @@ class WordDocumentStream(DOCDirStream):
         for i in fields:
             self.printAndSet(i, self.readuInt32())
 
-    def __findText(self, plcPcd, cp):
-        """Find the largest i such that plcPcd.aCp[i] <= cp."""
-        for i in range(len(plcPcd.aCp)):
-            if plcPcd.aCp[i] <= cp:
-                index = i
-        return index
-
-    def __isOffsetCompressed(self, off):
-        """Is the given offset compressed?"""
-        compressed = None
-        plcPcd = self.clx.pcdt.plcPcd
-        for i in range(len(plcPcd.aCp)):
-            aPcd = plcPcd.aPcd[i]
-            fcCompressed = aPcd.fc
-            if fcCompressed.fCompressed == 1:
-                offset = fcCompressed.fc/2
-            else:
-                offset = fcCompressed.fc
-            if offset <= off:
-                compressed = fcCompressed.fCompressed
-        return compressed
-
     def retrieveOffset(self, start, end):
         """Retrieves text, defined by raw byte offsets."""
 
-        compressed = self.__isOffsetCompressed(start)
-        if compressed == None:
-            compressed = self.__isOffsetCompressed(end)
+        startCp = self.__offsetToCP(start)
+        endCp = self.__offsetToCP(end)
+        if startCp is None or endCp is None:
+            return ""
+        return self.retrieveCPs(startCp, endCp)
 
-        if compressed == None:
-            raise Exception("should not happen")
-
-        if compressed:
-            return globals.encodeName(self.bytes[start:end])
-        else:
-            return globals.encodeName(self.bytes[start:end].decode('utf-16'), lowOnly = True)
-
-    def retrieveCP(self, cp):
+    def __offsetToCP(self, offset):
+        plcPcd = self.clx.pcdt.plcPcd
+        for i in range(len(plcPcd.ranges)):
+            start, end = plcPcd.ranges[i]
+            # Count offset of the last-but-one CP, the last CP is in fact not included in the range.
+            end -= 1
+            startOffset, compressed = self.__cpToOffset(start)
+            endOffset = self.__cpToOffset(end)[0]
+            if compressed:
+                endOffset += 1
+            else:
+                endOffset += 2
+            if offset >= startOffset and offset <= endOffset:
+                if compressed:
+                    divider = 1
+                else:
+                    divider = 2
+                return (start + ((offset - startOffset) / divider))
+
+    def __cpToOffset(self, cp):
         """Implements 2.4.1 Retrieving Text."""
         plcPcd = self.clx.pcdt.plcPcd
         for i in range(len(plcPcd.aCp)):
@@ -941,9 +932,17 @@ class WordDocumentStream(DOCDirStream):
         aPcd = plcPcd.aPcd[index]
         fcCompressed = aPcd.fc
         if fcCompressed.fCompressed == 1:
-            return globals.encodeName(self.bytes[(fcCompressed.fc/2) + (cp - plcPcd.aCp[index])])
+            pos = (fcCompressed.fc/2) + (cp - plcPcd.aCp[index])
+            return pos, True
         else:
             pos = fcCompressed.fc + 2 * (cp - plcPcd.aCp[index])
+            return pos, False
+
+    def retrieveCP(self, cp):
+        pos, compressed = self.__cpToOffset(cp)
+        if compressed:
+            return globals.encodeName(self.bytes[pos])
+        else:
             return globals.encodeName(self.bytes[pos:pos+2].decode('utf-16'), lowOnly = True)
 
     def retrieveCPs(self, start, end):


More information about the Libreoffice-commits mailing list