[poppler] 4 commits - glib/poppler-page.cc poppler/TextOutputDev.cc poppler/TextOutputDev.h

Carlos Garcia Campos carlosgc at kemper.freedesktop.org
Tue Jun 25 02:48:00 PDT 2013


 glib/poppler-page.cc     |  168 ++++++++++++++++--------------
 poppler/TextOutputDev.cc |  261 +++++++++++++++++++++++++----------------------
 poppler/TextOutputDev.h  |    4 
 3 files changed, 235 insertions(+), 198 deletions(-)

New commits:
commit c55b577ce69ad4bb69f5261b3e120e92c9fdb3d0
Author: Carlos Garcia Campos <carlosgc at gnome.org>
Date:   Tue Jun 25 10:01:38 2013 +0200

    glib: Use TextPage::getSelectionWords to build text layout and attributes
    
    This way we can make sure that the list of words used in
    poppler_page_get_text_layout and poppler_page_get_text_attributes is the
    same as the one used in poppler_page_get_text. This fixes the mismatch
    between the number of characters in the text returned by
    poppler_page_get_text and the number of characters returned by
    poppler_page_get_text_layout in some documents.

diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc
index 631edb5..9115b78 100644
--- a/glib/poppler-page.cc
+++ b/glib/poppler-page.cc
@@ -1979,66 +1979,66 @@ poppler_page_get_text_layout (PopplerPage       *page,
                               guint             *n_rectangles)
 {
   TextPage *text;
-  TextWordList *wordlist;
-  TextWord *word, *nextword;
   PopplerRectangle *rect;
-  int i, j;
+  PDFRectangle selection;
+  int i, j, k;
   guint offset = 0;
   guint n_rects = 0;
   gdouble x1, y1, x2, y2;
   gdouble x3, y3, x4, y4;
+  GooList **word_list;
+  int n_lines;
 
   g_return_val_if_fail (POPPLER_IS_PAGE (page), FALSE);
 
   *n_rectangles = 0;
 
+  poppler_page_get_size (page, &selection.x2, &selection.y2);
   text = poppler_page_get_text_page (page);
-  wordlist = text->makeWordList (gFalse);
+  word_list = text->getSelectionWords (&selection, selectionStyleGlyph, &n_lines);
+  if (!word_list)
+          return FALSE;
 
-  if (wordlist->getLength () <= 0)
+  n_rects += n_lines - 1;
+  for (i = 0; i < n_lines; i++)
     {
-      delete wordlist;
-      return FALSE;
-    }
-
-  // Getting the array size
-  for (i = 0; i < wordlist->getLength (); i++)
-    {
-      word = wordlist->get (i);
-      n_rects += word->getLength ();
-      if (!word->getNext () || word->getSpaceAfter ())
-	n_rects++;
+      GooList *line_words = word_list[i];
+      n_rects += line_words->getLength() - 1;
+      for (j = 0; j < line_words->getLength(); j++)
+        {
+          TextWord *word = (TextWord *)line_words->get(j);
+          n_rects += word->getLength();
+        }
     }
-  n_rects--;
 
-  *n_rectangles = n_rects;
   *rectangles = g_new (PopplerRectangle, n_rects);
+  *n_rectangles = n_rects;
 
-  // Calculating each char position
-  for (i = 0; i < wordlist->getLength (); i++)
+  for (i = 0; i < n_lines; i++)
     {
-      word = wordlist->get (i);
-      for (j = 0; j < word->getLength (); j++)
+      GooList *line_words = word_list[i];
+      for (j = 0; j < line_words->getLength(); j++)
         {
+          TextWord *word = (TextWord *)line_words->get(j);
+          for (k = 0; k < word->getLength(); k++)
+            {
+              rect = *rectangles + offset;
+              word->getCharBBox (k,
+                                 &(rect->x1),
+                                 &(rect->y1),
+                                 &(rect->x2),
+                                 &(rect->y2));
+              offset++;
+            }
+
           rect = *rectangles + offset;
-	  word->getCharBBox (j,
-			     &(rect->x1),
-			     &(rect->y1),
-			     &(rect->x2),
-			     &(rect->y2));
-	  offset++;
-	}
+          word->getBBox (&x1, &y1, &x2, &y2);
 
-      // adding spaces and break lines
-      rect = *rectangles + offset;
-      word->getBBox (&x1, &y1, &x2, &y2);
+          if (j < line_words->getLength() - 1)
+            {
+              TextWord *next_word = (TextWord *)line_words->get(j + 1);
 
-      nextword = word->getNext ();
-      if (nextword)
-        {
-	  if (word->getSpaceAfter ())
-	    {
-	      nextword->getBBox (&x3, &y3, &x4, &y4);
+              next_word->getBBox(&x3, &y3, &x4, &y4);
 	      // space is from one word to other and with the same height as
 	      // first word.
 	      rect->x1 = x2;
@@ -2046,20 +2046,23 @@ poppler_page_get_text_layout (PopplerPage       *page,
 	      rect->x2 = x3;
 	      rect->y2 = y2;
 	      offset++;
-	    }
-	  }
-      else if (offset < n_rects)
+            }
+        }
+
+      if (i < n_lines - 1 && offset > 0)
         {
-	  // end of line
-	  rect->x1 = x2;
-	  rect->y1 = y2;
-	  rect->x2 = x2;
-	  rect->y2 = y2;
-	  offset++;
-	}
+          // end of line
+          rect->x1 = x2;
+          rect->y1 = y2;
+          rect->x2 = x2;
+          rect->y2 = y2;
+          offset++;
+        }
+
+      delete line_words;
     }
 
-  delete wordlist;
+  gfree (word_list);
 
   return TRUE;
 }
@@ -2122,53 +2125,62 @@ GList *
 poppler_page_get_text_attributes (PopplerPage *page)
 {
   TextPage *text;
-  TextWordList *wordlist;
+  PDFRectangle selection;
+  GooList **word_list;
+  int n_lines;
   PopplerTextAttributes *attrs = NULL;
-  gint i, offset = 0;
+  TextWord *word, *prev_word = NULL;
+  gint word_i, prev_word_i;
+  gint i, j;
+  gint offset = 0;
   GList *attributes = NULL;
 
   g_return_val_if_fail (POPPLER_IS_PAGE (page), NULL);
 
+  poppler_page_get_size (page, &selection.x2, &selection.y2);
   text = poppler_page_get_text_page (page);
-  wordlist = text->makeWordList (gFalse);
+  word_list = text->getSelectionWords (&selection, selectionStyleGlyph, &n_lines);
+  if (!word_list)
+          return NULL;
 
-  if (wordlist->getLength () <= 0)
+  for (i = 0; i < n_lines; i++)
     {
-      delete wordlist;
-      return NULL;
-    }
-
-  TextWord *word, *prev_word = NULL;
-  gint word_i, prev_word_i;
+      GooList *line_words = word_list[i];
+      for (j = 0; j < line_words->getLength(); j++)
+        {
+          word = (TextWord *)line_words->get(j);
 
-  // Calculating each word attributes
-  for (i = 0; i < wordlist->getLength (); i++)
-    {
-      word = wordlist->get (i);
+          for (word_i = 0; word_i < word->getLength (); word_i++)
+            {
+              if (!prev_word || !word_text_attributes_equal (word, word_i, prev_word, prev_word_i))
+                {
+                  attrs = poppler_text_attributes_new_from_word (word, word_i);
+                  attrs->start_index = offset;
+                  attributes = g_list_prepend (attributes, attrs);
+                }
+              attrs->end_index = offset;
+              offset++;
+              prev_word = word;
+              prev_word_i = word_i;
+            }
 
-      for (word_i = 0; word_i < word->getLength (); word_i++)
-	{
-	  if (!prev_word || !word_text_attributes_equal (word, word_i, prev_word, prev_word_i))
+          if (j < line_words->getLength() - 1)
             {
-              attrs = poppler_text_attributes_new_from_word (word, word_i);
-              attrs->start_index = offset;
-              attributes = g_list_prepend (attributes, attrs);
+              attrs->end_index = offset;
+              offset++;
             }
-	  attrs->end_index = offset;
-	  offset++;
-	  prev_word = word;
-	  prev_word_i = word_i;
-	}
-      if (!word->getNext () || word->getSpaceAfter ())
+        }
+
+      if (i < n_lines - 1)
         {
           attrs->end_index = offset;
           offset++;
         }
+
+      delete line_words;
     }
-  if (attrs)
-    attrs->end_index--;
 
-  delete wordlist;
+  gfree (word_list);
 
   return g_list_reverse(attributes);
 }
commit fc534f571315c064005515c19d7d70ad3af1563e
Author: Carlos Garcia Campos <carlosgc at gnome.org>
Date:   Tue Jun 25 10:05:01 2013 +0200

    TextOutputDev: add a method to TextPage to get the selection as a list of words
    
    Returns a list of lines of words.

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 4ef5963..928e95a 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -4043,6 +4043,7 @@ public:
   void endPage();
 
   GooString *getText(void);
+  GooList **getWordList(int *nLines);
 
 private:
 
@@ -4177,6 +4178,29 @@ GooString *TextSelectionDumper::getText (void)
   return text;
 }
 
+GooList **TextSelectionDumper::getWordList(int *nLinesOut)
+{
+  int i, j;
+
+  if (nLines == 0)
+    return NULL;
+
+  GooList **wordList = (GooList **)gmallocn(nLines, sizeof(GooList *));
+
+  for (i = 0; i < nLines; i++) {
+    GooList *lineWords = lines[i];
+    wordList[i] = new GooList();
+    for (j = 0; j < lineWords->getLength(); j++) {
+      TextWordSelection *sel = (TextWordSelection *)lineWords->get(j);
+      wordList[i]->append(sel->word);
+    }
+  }
+
+  *nLinesOut = nLines;
+
+  return wordList;
+}
+
 class TextSelectionSizer : public TextSelectionVisitor {
 public:
   TextSelectionSizer(TextPage *page, double scale);
@@ -4751,6 +4775,18 @@ GooString *TextPage::getSelectionText(PDFRectangle *selection,
   return dumper.getText();
 }
 
+GooList **TextPage::getSelectionWords(PDFRectangle *selection,
+                                      SelectionStyle style,
+                                      int *nLines)
+{
+  TextSelectionDumper dumper(this);
+
+  visitSelection(&dumper, selection, style);
+  dumper.endPage();
+
+  return dumper.getWordList(nLines);
+}
+
 GBool TextPage::findCharRange(int pos, int length,
 			      double *xMin, double *yMin,
 			      double *xMax, double *yMax) {
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index 664f9d1..6269f1c 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -563,6 +563,10 @@ public:
   GooString *getSelectionText(PDFRectangle *selection,
 			      SelectionStyle style);
 
+  GooList **getSelectionWords(PDFRectangle *selection,
+                              SelectionStyle style,
+                              int *nLines);
+
   // Find a string by character position and length.  If found, sets
   // the text bounding rectangle and returns true; otherwise returns
   // false.
commit a924246b7534e86165f8e9ab6c60d56b73a17b94
Author: Carlos Garcia Campos <carlosgc at gnome.org>
Date:   Tue Jun 25 09:57:48 2013 +0200

    TextOutputDev: simplify the text selection dumper
    
    Build a list of lines of words and don't try to format the text when
    detecting tables; simply add the words and lines in the right order.

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 2872f02..4ef5963 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -4039,26 +4039,55 @@ public:
 			  int edge_end,
 			  PDFRectangle *selection);
   virtual void visitWord (TextWord *word, int begin, int end,
-			  PDFRectangle *selection) { };
+			  PDFRectangle *selection);
+  void endPage();
 
   GooString *getText(void);
 
 private:
-  TextLineFrag *frags;
-  int nFrags, fragsSize;
+
+  void startLine();
+  void finishLine();
+
+  GooList **lines;
+  int nLines, linesSize;
+  GooList *words;
+  int tableId;
+  TextBlock *currentBlock;
 };
 
 TextSelectionDumper::TextSelectionDumper(TextPage *page)
     : TextSelectionVisitor(page)
 {
-  fragsSize = 256;
-  frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
-  nFrags = 0;
+  linesSize = 256;
+  lines = (GooList **)gmallocn(linesSize, sizeof(GooList *));
+  nLines = 0;
+
+  tableId = -1;
+  currentBlock = NULL;
+  words = NULL;
 }
 
 TextSelectionDumper::~TextSelectionDumper()
 {
-  gfree(frags);
+  for (int i = 0; i < nLines; i++)
+    deleteGooList(lines[i], TextWordSelection);
+  gfree(lines);
+}
+
+void TextSelectionDumper::startLine()
+{
+  finishLine();
+  words = new GooList();
+}
+
+void TextSelectionDumper::finishLine()
+{
+  if (words && words->getLength() > 0)
+    lines[nLines++] = words;
+  else if (words)
+    delete words;
+  words = NULL;
 }
 
 void TextSelectionDumper::visitLine (TextLine *line,
@@ -4068,130 +4097,84 @@ void TextSelectionDumper::visitLine (TextLine *line,
 				     int edge_end,
 				     PDFRectangle *selection)
 {
-  if (nFrags == fragsSize) {
-    fragsSize *= 2;
-    frags = (TextLineFrag *) grealloc(frags, fragsSize * sizeof(TextLineFrag));
+  TextLineFrag frag;
+
+  if (nLines == linesSize) {
+    linesSize *= 2;
+    lines = (GooList **)grealloc(lines, linesSize * sizeof(GooList *));
+  }
+
+  frag.init(line, edge_begin, edge_end - edge_begin);
+
+  if (tableId >= 0 && frag.line->blk->tableId < 0) {
+    finishLine();
+
+    tableId = -1;
+    currentBlock = NULL;
   }
 
-  frags[nFrags].init(line, edge_begin, edge_end - edge_begin);
-  ++nFrags;
+  if (frag.line->blk->tableId >= 0) { // a table
+    if (tableId == -1) {
+      tableId = frag.line->blk->tableId;
+      currentBlock = frag.line->blk;
+    }
+
+    if (currentBlock == frag.line->blk) { // the same block
+      startLine();
+    } else { // another block
+      if (currentBlock->tableEnd) { // previous block ended its row
+        startLine();
+      }
+      currentBlock = frag.line->blk;
+    }
+  } else { // not a table
+    startLine();
+  }
+}
 
+void TextSelectionDumper::visitWord (TextWord *word, int begin, int end,
+                                     PDFRectangle *selection)
+{
+  words->append(new TextWordSelection(word, begin, end));
+}
+
+void TextSelectionDumper::endPage()
+{
+  finishLine();
 }
 
 GooString *TextSelectionDumper::getText (void)
 {
-  GooString *s;
-  TextLineFrag *frag;
+  GooString *text;
   int i, j;
   UnicodeMap *uMap;
   char space[8], eol[16];
   int spaceLen, eolLen;
-  GooList *strings = NULL;
-  int actual_table = -1;
-  int actual_line = -1;
-  int last_length = 0;
-  TextBlock *actual_block = NULL;
-
-  s = new GooString();
 
-  uMap = globalParams->getTextEncoding();
+  text = new GooString();
 
-  if (uMap == NULL)
-      return s;
+  if (!(uMap = globalParams->getTextEncoding()))
+    return text;
 
   spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
   eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
 
-  if (nFrags > 0) {
-    for (i = 0; i < nFrags; ++i) {
-      frag = &frags[i];
-
-      if (actual_table >= 0 && frag->line->blk->tableId < 0) {
-        for (j = 0; j < strings->getLength (); j++) {
-          s->append ((GooString*) strings->get (j));
-          s->append (eol, eolLen);
-          delete ((GooString*) strings->get (j));
-        }
-        delete strings;
-        strings = NULL;
-        actual_table = -1;
-        actual_line = -1;
-        actual_block = NULL;
-      }
-
-      // a table
-      if (frag->line->blk->tableId >= 0) {
-        if (actual_table == -1) {
-          strings = new GooList();
-          actual_table = frag->line->blk->tableId;
-          actual_block = frag->line->blk;
-          actual_line = -1;
-        }
-
-        // the same block
-        if (actual_block == frag->line->blk) {
-          actual_line++;
-          if (actual_line >= strings->getLength ()) {
-            GooString *t = new GooString ();
-            // add some spaces to have this block correctly aligned
-            if (actual_line > 0)
-              for (j = 0; j < ((GooString*) (strings->get (actual_line - 1)))->getLength() - last_length - 1; j++)
-                t->append (space, spaceLen);
-            strings->append (t);
-          }
-        }
-        // another block
-        else {
-          // previous block ended its row
-          if (actual_block->tableEnd) {
-            for (j = 0; j < strings->getLength (); j++) {
-              s->append ((GooString*) strings->get (j));
-              s->append (eol, eolLen);
-              delete ((GooString*) strings->get (j));
-            }
-            delete strings;
-
-            strings = new GooList();
-            GooString *t = new GooString ();
-            strings->append (t);
-          }
-          actual_block = frag->line->blk;
-          actual_line = 0;
-        }
-
-        page->dumpFragment(frag->line->text + frag->start, frag->len, uMap, ((GooString*) strings->get (actual_line)));
-        last_length = frag->len;
+  for (i = 0; i < nLines; i++) {
+    GooList *lineWords = lines[i];
+    for (j = 0; j < lineWords->getLength(); j++) {
+      TextWordSelection *sel = (TextWordSelection *)lineWords->get(j);
 
-        if (!frag->line->blk->tableEnd) {
-          ((GooString*) strings->get (actual_line))->append (space, spaceLen);
-        }
-      }
-      // not a table
-      else {
-        page->dumpFragment (frag->line->text + frag->start, frag->len, uMap, s);
-        if (i < nFrags - 1) {
-          s->append (eol, eolLen);
-        }
-      }
-    }
-
-    if (strings != NULL) {
-      for (j = 0; j < strings->getLength (); j++) {
-        s->append((GooString*) strings->get (j));
-        s->append(eol, eolLen);
-        delete ((GooString*) strings->get (j));
-      }
-      delete strings;
-      strings = NULL;
-      actual_table = -1;
-      actual_line = -1;
-      actual_block = NULL;
+      page->dumpFragment (sel->word->text + sel->begin, sel->end - sel->begin, uMap, text);
+      if (j < lineWords->getLength() - 1)
+        text->append(space, spaceLen);
     }
+    if (i < nLines - 1)
+      text->append(eol, eolLen);
   }
 
   uMap->decRefCnt();
 
-  return s;
+  return text;
 }
 
 class TextSelectionSizer : public TextSelectionVisitor {
@@ -4763,6 +4746,7 @@ GooString *TextPage::getSelectionText(PDFRectangle *selection,
   TextSelectionDumper dumper(this);
 
   visitSelection(&dumper, selection, style);
+  dumper.endPage();
 
   return dumper.getText();
 }
commit c849094a2daf896d085937adff1f7659a09da062
Author: Carlos Garcia Campos <carlosgc at gnome.org>
Date:   Mon Jun 24 18:29:11 2013 +0200

    TextOutputDev: Move TextSelection class from TextSelectionPainter to TextSelectionVisitor
    
    So that it can be used by other TextSelectionVisitor implementations.
    Also renamed it to TextWordSelection since it contains a word selection.

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index fe051f6..2872f02 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -3999,6 +3999,21 @@ public:
 			  PDFRectangle *selection) = 0;
 
 protected:
+
+  class TextWordSelection {
+  public:
+    TextWordSelection(TextWord *word, int begin, int end)
+      : word(word),
+        begin(begin),
+        end(end)
+    {
+    }
+
+    TextWord *word;
+    int begin;
+    int end;
+  };
+
   TextPage *page;
 };
 
@@ -4265,20 +4280,6 @@ private:
   GfxState *state;
   GooList *selectionList;
   Matrix ctm, ictm;
-
-  class TextSelection {
-  public:
-    TextSelection(TextWord *word, int begin, int end)
-      : word(word),
-	begin(begin),
-	end(end)
-    {
-    }
-
-    TextWord *word;
-    int begin;
-    int end;
-  };
 };
 
 TextSelectionPainter::TextSelectionPainter(TextPage *page,
@@ -4310,7 +4311,7 @@ TextSelectionPainter::TextSelectionPainter(TextPage *page,
 
 TextSelectionPainter::~TextSelectionPainter()
 {
-  deleteGooList(selectionList, TextSelection);
+  deleteGooList(selectionList, TextWordSelection);
   delete state;
 }
 
@@ -4350,7 +4351,7 @@ void TextSelectionPainter::visitLine (TextLine *line,
 void TextSelectionPainter::visitWord (TextWord *word, int begin, int end,
 				      PDFRectangle *selection)
 {
-  selectionList->append(new TextSelection(word, begin, end));
+  selectionList->append(new TextWordSelection(word, begin, end));
 }
 
 void TextSelectionPainter::endPage()
@@ -4362,7 +4363,7 @@ void TextSelectionPainter::endPage()
   out->updateFillColor(state);
 
   for (int i = 0; i < selectionList->getLength(); i++) {
-    TextSelection *sel = (TextSelection *) selectionList->get(i);
+    TextWordSelection *sel = (TextWordSelection *) selectionList->get(i);
     int begin = sel->begin;
 
     while (begin < sel->end) {


More information about the poppler mailing list