[poppler] glib/poppler-page.cc

Carlos Garcia Campos carlosgc at kemper.freedesktop.org
Sat Nov 24 04:42:45 PST 2012


 glib/poppler-page.cc |   62 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 35 insertions(+), 27 deletions(-)

New commits:
commit b0297110c455eb18096268b59d6095d428380de5
Author: Jason Crain <jason at aquaticape.us>
Date:   Wed Nov 21 14:15:59 2012 -0600

    glib: check if words end with spaces
    
    poppler_page_get_text_layout and poppler_page_get_text_attributes
    assume that each word ends with a space or newline, causing their
    results to become mismatched from the text.  This adds a check of
    TextWord::getSpaceAfter.
    
    https://bugs.freedesktop.org/show_bug.cgi?id=54504

diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc
index b362a62..52dab5f 100644
--- a/glib/poppler-page.cc
+++ b/glib/poppler-page.cc
@@ -1980,7 +1980,9 @@ poppler_page_get_text_layout (PopplerPage       *page,
   TextWordList *wordlist;
   TextWord *word, *nextword;
   PopplerRectangle *rect;
-  int i, j, offset = 0;
+  int i, j;
+  guint offset = 0;
+  guint n_rects = 0;
   gdouble x1, y1, x2, y2;
   gdouble x3, y3, x4, y4;
 
@@ -2001,10 +2003,14 @@ poppler_page_get_text_layout (PopplerPage       *page,
   for (i = 0; i < wordlist->getLength (); i++)
     {
       word = wordlist->get (i);
-      *n_rectangles += word->getLength () + 1;
+      n_rects += word->getLength ();
+      if (!word->getNext () || word->getSpaceAfter ())
+	n_rects++;
     }
+  n_rects--;
 
-  *rectangles = g_new (PopplerRectangle, *n_rectangles);
+  *n_rectangles = n_rects;
+  *rectangles = g_new (PopplerRectangle, n_rects);
 
   // Calculating each char position
   for (i = 0; i < wordlist->getLength (); i++)
@@ -2028,23 +2034,27 @@ poppler_page_get_text_layout (PopplerPage       *page,
       nextword = word->getNext ();
       if (nextword)
         {
-	  nextword->getBBox (&x3, &y3, &x4, &y4);
-	  // space is from one word to other and with the same height as
-	  // first word.
-	  rect->x1 = x2;
-	  rect->y1 = y1;
-	  rect->x2 = x3;
-	  rect->y2 = y2;
-	}
-      else
+	  if (word->getSpaceAfter ())
+	    {
+	      nextword->getBBox (&x3, &y3, &x4, &y4);
+	      // space is from one word to other and with the same height as
+	      // first word.
+	      rect->x1 = x2;
+	      rect->y1 = y1;
+	      rect->x2 = x3;
+	      rect->y2 = y2;
+	      offset++;
+	    }
+	  }
+      else if (offset < n_rects)
         {
 	  // end of line
 	  rect->x1 = x2;
 	  rect->y1 = y2;
 	  rect->x2 = x2;
 	  rect->y2 = y2;
+	  offset++;
 	}
-      offset++;
     }
 
   delete wordlist;
@@ -2112,7 +2122,6 @@ poppler_page_get_text_attributes (PopplerPage *page)
   TextPage *text;
   TextWordList *wordlist;
   PopplerTextAttributes *attrs = NULL;
-  PopplerTextAttributes *previous = NULL;
   gint i, offset = 0;
   GList *attributes = NULL;
 
@@ -2137,23 +2146,22 @@ poppler_page_get_text_attributes (PopplerPage *page)
 
       for (word_i = 0; word_i < word->getLength (); word_i++)
 	{
-	  if (prev_word && word_text_attributes_equal (word, word_i, prev_word, prev_word_i)) {
-	    attrs = previous;
-	  } else {
-	    attrs = poppler_text_attributes_new_from_word (word, word_i);
-	    attrs->start_index = offset;
-	    if (previous)
-	      previous->end_index--;
-	    previous = attrs;
-	    attributes = g_list_prepend (attributes, attrs);
-	  }
-	  offset++;
+	  if (!prev_word || !word_text_attributes_equal (word, word_i, prev_word, prev_word_i))
+            {
+              attrs = poppler_text_attributes_new_from_word (word, word_i);
+              attrs->start_index = offset;
+              attributes = g_list_prepend (attributes, attrs);
+            }
 	  attrs->end_index = offset;
+	  offset++;
 	  prev_word = word;
 	  prev_word_i = word_i;
 	}
-      offset++;
-      attrs->end_index = offset;
+      if (!word->getNext () || word->getSpaceAfter ())
+        {
+          attrs->end_index = offset;
+          offset++;
+        }
     }
   if (attrs)
     attrs->end_index--;


More information about the poppler mailing list