[poppler] Branch 'xpdf303merge' - cpp/poppler-page.cpp glib/poppler-page.cc poppler/ArthurOutputDev.cc poppler/CairoOutputDev.cc poppler/Gfx.cc poppler/PSOutputDev.cc poppler/TextOutputDev.cc poppler/TextOutputDev.h qt4/src test/perf-test.cc utils/pdftotext.1 utils/pdftotext.cc

Albert Astals Cid aacid at kemper.freedesktop.org
Sun Feb 5 15:26:24 PST 2012


 cpp/poppler-page.cpp       |   10 -
 glib/poppler-page.cc       |    5 
 poppler/ArthurOutputDev.cc |    1 
 poppler/CairoOutputDev.cc  |    2 
 poppler/Gfx.cc             |    6 
 poppler/PSOutputDev.cc     |    6 
 poppler/TextOutputDev.cc   |  360 ++++++++++++++++++++++++++++-----------------
 poppler/TextOutputDev.h    |   18 +-
 qt4/src/poppler-page.cc    |   12 -
 test/perf-test.cc          |    2 
 utils/pdftotext.1          |    4 
 utils/pdftotext.cc         |   10 +
 12 files changed, 270 insertions(+), 166 deletions(-)

New commits:
commit e17f09563276ee25b6acfc127b6ea360da650030
Author: Albert Astals Cid <aacid at kde.org>
Date:   Mon Feb 6 00:25:53 2012 +0100

    [xpdf303] TextOutputDev and associated changes

diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index 4e2f730..d72a477 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -208,24 +208,24 @@ bool page::search(const ustring &text, rectf &r, search_direction_enum direction
     double rect_right = r.right();
     double rect_bottom = r.bottom();
 
-    TextOutputDev td(NULL, gTrue, gFalse, gFalse);
+    TextOutputDev td(NULL, gTrue, 0, gFalse, gFalse);
     d->doc->doc->displayPage(&td, d->index + 1, 72, 72, rotation_value, false, true, false);
     TextPage *text_page = td.takeText();
 
     switch (direction) {
     case search_from_top:
         found = text_page->findText(&u[0], len,
-                    gTrue, gTrue, gFalse, gFalse, sCase, gFalse,
+                    gTrue, gTrue, gFalse, gFalse, sCase, gFalse, gFalse,
                     &rect_left, &rect_top, &rect_right, &rect_bottom);
         break;
     case search_next_result:
         found = text_page->findText(&u[0], len,
-                    gFalse, gTrue, gTrue, gFalse, sCase, gFalse,
+                    gFalse, gTrue, gTrue, gFalse, sCase, gFalse, gFalse,
                     &rect_left, &rect_top, &rect_right, &rect_bottom);
         break;
     case search_previous_result:
         found = text_page->findText(&u[0], len,
-                    gFalse, gTrue, gTrue, gFalse, sCase, gTrue,
+                    gFalse, gTrue, gTrue, gFalse, sCase, gTrue, gFalse,
                     &rect_left, &rect_top, &rect_right, &rect_bottom);
         break;
     }
@@ -267,7 +267,7 @@ ustring page::text(const rectf &r, text_layout_enum layout_mode) const
 {
     std::auto_ptr<GooString> s;
     const GBool use_raw_order = (layout_mode == raw_order_layout);
-    TextOutputDev td(0, gFalse, use_raw_order, gFalse);
+    TextOutputDev td(0, gFalse, 0, use_raw_order, gFalse);
     d->doc->doc->displayPage(&td, d->index + 1, 72, 72, 0, false, true, false);
     if (r.is_empty()) {
         const PDFRectangle *rect = d->page->getCropBox();
diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc
index 7b98625..a95ff6a 100644
--- a/glib/poppler-page.cc
+++ b/glib/poppler-page.cc
@@ -263,7 +263,7 @@ poppler_page_get_text_page (PopplerPage *page)
     TextOutputDev *text_dev;
     Gfx           *gfx;
 
-    text_dev = new TextOutputDev (NULL, gTrue, gFalse, gFalse);
+    text_dev = new TextOutputDev (NULL, gTrue, 0, gFalse, gFalse);
     gfx = page->page->createGfx(text_dev,
 				72.0, 72.0, 0,
 				gFalse, /* useMediaBox */
@@ -888,6 +888,7 @@ poppler_page_find_text (PopplerPage *page,
 			     gFalse, gTrue, // startAtTop, stopAtBottom
 			     gFalse, gFalse, // startAtLast, stopAtLast
 			     gFalse, gFalse, // caseSensitive, backwards
+			     gFalse, // wholeWord
 			     &xMin, &yMin, &xMax, &yMax))
     {
       match = poppler_rectangle_new ();
@@ -1064,7 +1065,7 @@ poppler_page_render_to_ps (PopplerPage   *page,
                                     ps_file->first_page, ps_file->last_page,
                                     psModePS, (int)ps_file->paper_width,
                                     (int)ps_file->paper_height, ps_file->duplex,
-                                    0, 0, 0, 0, gFalse, gFalse);
+                                    0, 0, 0, 0, gFalse);
 
 
   ps_file->document->doc->displayPage (ps_file->out, page->index + 1, 72.0, 72.0,
diff --git a/poppler/ArthurOutputDev.cc b/poppler/ArthurOutputDev.cc
index 92bc84f..301232b 100644
--- a/poppler/ArthurOutputDev.cc
+++ b/poppler/ArthurOutputDev.cc
@@ -292,6 +292,7 @@ void ArthurOutputDev::updateFont(GfxState *state)
   m_font = NULL;
   fileName = NULL;
   tmpBuf = NULL;
+  fontLoc = NULL;
 
   if (!(gfxFont = state->getFont())) {
     goto err1;
diff --git a/poppler/CairoOutputDev.cc b/poppler/CairoOutputDev.cc
index dc5698e..340b3a3 100644
--- a/poppler/CairoOutputDev.cc
+++ b/poppler/CairoOutputDev.cc
@@ -246,7 +246,7 @@ void CairoOutputDev::startPage(int pageNum, GfxState *state) {
 void CairoOutputDev::endPage() {
   if (text) {
     text->endPage();
-    text->coalesce(gTrue, gFalse);
+    text->coalesce(gTrue, 0, gFalse);
   }
 }
 
diff --git a/poppler/Gfx.cc b/poppler/Gfx.cc
index 4fa9431..46e9c4a 100644
--- a/poppler/Gfx.cc
+++ b/poppler/Gfx.cc
@@ -2866,7 +2866,7 @@ void Gfx::doRadialShFill(GfxRadialShading *shading) {
   GfxColor colorA, colorB;
   double xa, ya, xb, yb, ra, rb;
   double ta, tb, sa, sb;
-  double sz, sMin, sMax, h;
+  double sMin, sMax, h;
   double sLeft, sRight, sTop, sBottom, sZero, sDiag;
   GBool haveSLeft, haveSRight, haveSTop, haveSBottom, haveSZero;
   GBool haveSMin, haveSMax;
@@ -2888,18 +2888,14 @@ void Gfx::doRadialShFill(GfxRadialShading *shading) {
   if (h == 0) {
     enclosed = gTrue;
     theta = 0; // make gcc happy
-    sz = 0; // make gcc happy
   } else if (r1 - r0 == 0) {
     enclosed = gFalse;
     theta = 0;
-    sz = 0; // make gcc happy
   } else if (fabs(r1 - r0) >= h) {
     enclosed = gTrue;
     theta = 0; // make gcc happy
-    sz = 0; // make gcc happy
   } else {
     enclosed = gFalse;
-    sz = -r0 / (r1 - r0);
     theta = asin((r1 - r0) / h);
   }
 
diff --git a/poppler/PSOutputDev.cc b/poppler/PSOutputDev.cc
index 7dbac8a..d91cae9 100644
--- a/poppler/PSOutputDev.cc
+++ b/poppler/PSOutputDev.cc
@@ -4178,7 +4178,7 @@ GBool PSOutputDev::radialShadedFill(GfxState *state, GfxRadialShading *shading,
   double xMin, yMin, xMax, yMax;
   double x0, y0, r0, x1, y1, r1, t0, t1;
   double xa, ya, ra;
-  double sz, sMin, sMax, h, ta;
+  double sMin, sMax, h, ta;
   double sLeft, sRight, sTop, sBottom, sZero, sDiag;
   GBool haveSLeft, haveSRight, haveSTop, haveSBottom, haveSZero;
   GBool haveSMin, haveSMax;
@@ -4206,18 +4206,14 @@ GBool PSOutputDev::radialShadedFill(GfxState *state, GfxRadialShading *shading,
   if (h == 0) {
     enclosed = gTrue;
     theta = 0; // make gcc happy
-    sz = 0; // make gcc happy
   } else if (r1 - r0 == 0) {
     enclosed = gFalse;
     theta = 0;
-    sz = 0; // make gcc happy
   } else if (fabs(r1 - r0) >= h) {
     enclosed = gTrue;
     theta = 0; // make gcc happy
-    sz = 0; // make gcc happy
   } else {
     enclosed = gFalse;
-    sz = -r0 / (r1 - r0);
     theta = asin((r1 - r0) / h);
   }
   if (enclosed) {
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 7a0b8d9..531617d 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -663,7 +663,7 @@ void TextPool::addWord(TextWord *word) {
 
   // insert the new word
   if (cursor && wordBaseIdx == cursorBaseIdx &&
-      word->primaryCmp(cursor) > 0) {
+      word->primaryCmp(cursor) >= 0) {
     w0 = cursor;
     w1 = cursor->next;
   } else {
@@ -1053,7 +1053,7 @@ void TextLineFrag::computeCoords(GBool oneRot) {
 	xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
 	yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
 	yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
-	base = blk->yMin + base * (blk->yMax - blk->yMin);
+	base = blk->yMin + d4 * (blk->yMax - blk->yMin);
 	break;
       case 1:
 	xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
@@ -1277,15 +1277,15 @@ void TextBlock::addWord(TextWord *word) {
   }
 }
 
-void TextBlock::coalesce(UnicodeMap *uMap) {
+void TextBlock::coalesce(UnicodeMap *uMap, double fixedPitch) {
   TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
   TextLine *line, *line0, *line1;
   int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
   int baseIdx, bestWordBaseIdx, idx0, idx1;
   double minBase, maxBase;
-  double fontSize, delta, priDelta, secDelta;
+  double fontSize, wordSpacing, delta, priDelta, secDelta;
   TextLine **lineArray;
-  GBool found;
+  GBool found, overlap;
   int col1, col2;
   int i, j, k;
 
@@ -1295,11 +1295,7 @@ void TextBlock::coalesce(UnicodeMap *uMap) {
     while (word0) {
       priDelta = dupMaxPriDelta * word0->fontSize;
       secDelta = dupMaxSecDelta * word0->fontSize;
-      if (rot == 0 || rot == 3) {
-	maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
-      } else {
-	maxBaseIdx = pool->getBaseIdx(word0->base - secDelta);
-      }
+      maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
       found = gFalse;
       word1 = word2 = NULL; // make gcc happy
       for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
@@ -1396,6 +1392,7 @@ void TextBlock::coalesce(UnicodeMap *uMap) {
     maxBase = word0->base + maxIntraLineDelta * fontSize;
     minBaseIdx = pool->getBaseIdx(minBase);
     maxBaseIdx = pool->getBaseIdx(maxBase);
+    wordSpacing = fixedPitch ? fixedPitch : maxWordSpacing * fontSize;
 
     // find the rest of the words in this line
     while (1) {
@@ -1404,25 +1401,32 @@ void TextBlock::coalesce(UnicodeMap *uMap) {
       // this line
       bestWordBaseIdx = 0;
       bestWord0 = bestWord1 = NULL;
-      for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
+      overlap = gFalse;
+      for (baseIdx = minBaseIdx;
+	   !overlap && baseIdx <= maxBaseIdx;
+	   ++baseIdx) {
 	for (word0 = NULL, word1 = pool->getPool(baseIdx);
 	     word1;
 	     word0 = word1, word1 = word1->next) {
 	  if (word1->base >= minBase &&
-	      word1->base <= maxBase &&
-	      (delta = lastWord->primaryDelta(word1)) >=
-	        minCharSpacing * fontSize) {
-	    if (delta < maxWordSpacing * fontSize &&
-		(!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
-	      bestWordBaseIdx = baseIdx;
-	      bestWord0 = word0;
-	      bestWord1 = word1;
+	      word1->base <= maxBase) {
+	    delta = lastWord->primaryDelta(word1);
+	    if (delta < minCharSpacing * fontSize) {
+	      overlap = gTrue;
+	      break;
+	    } else {
+	      if (delta < wordSpacing &&
+		  (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
+		bestWordBaseIdx = baseIdx;
+		bestWord0 = word0;
+		bestWord1 = word1;
+	      }
+	      break;
 	    }
-	    break;
 	  }
 	}
       }
-      if (!bestWord1) {
+      if (overlap || !bestWord1) {
 	break;
       }
 
@@ -1469,52 +1473,79 @@ void TextBlock::coalesce(UnicodeMap *uMap) {
 
   // column assignment
   nColumns = 0;
-  for (i = 0; i < nLines; ++i) {
-    line0 = lineArray[i];
-    col1 = 0;
-    for (j = 0; j < i; ++j) {
-      line1 = lineArray[j];
-      if (line1->primaryDelta(line0) >= 0) {
-	col2 = line1->col[line1->len] + 1;
-      } else {
-	k = 0; // make gcc happy
-	switch (rot) {
-	case 0:
-	  for (k = 0;
-	       k < line1->len &&
-		 line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
-	       ++k) ;
-	  break;
-	case 1:
-	  for (k = 0;
-	       k < line1->len &&
-		 line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
-	       ++k) ;
-	  break;
-	case 2:
-	  for (k = 0;
-	       k < line1->len &&
-		 line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
-	       ++k) ;
-	  break;
-	case 3:
-	  for (k = 0;
-	       k < line1->len &&
-		 line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
-	       ++k) ;
-	  break;
-	}
-	col2 = line1->col[k];
+  if (fixedPitch) {
+    for (i = 0; i < nLines; ++i) {
+      line0 = lineArray[i];
+      col1 = 0; // make gcc happy
+      switch (rot) {
+      case 0:
+	col1 = (int)((line0->xMin - xMin) / fixedPitch + 0.5);
+	break;
+      case 1:
+	col1 = (int)((line0->yMin - yMin) / fixedPitch + 0.5);
+	break;
+      case 2:
+	col1 = (int)((xMax - line0->xMax) / fixedPitch + 0.5);
+	break;
+      case 3:
+	col1 = (int)((yMax - line0->yMax) / fixedPitch + 0.5);
+	break;
       }
-      if (col2 > col1) {
-	col1 = col2;
+      for (k = 0; k <= line0->len; ++k) {
+	line0->col[k] += col1;
+      }
+      if (line0->col[line0->len] > nColumns) {
+	nColumns = line0->col[line0->len];
       }
     }
-    for (k = 0; k <= line0->len; ++k) {
-      line0->col[k] += col1;
-    }
-    if (line0->col[line0->len] > nColumns) {
-      nColumns = line0->col[line0->len];
+  } else {
+    for (i = 0; i < nLines; ++i) {
+      line0 = lineArray[i];
+      col1 = 0;
+      for (j = 0; j < i; ++j) {
+	line1 = lineArray[j];
+	if (line1->primaryDelta(line0) >= 0) {
+	  col2 = line1->col[line1->len] + 1;
+	} else {
+	  k = 0; // make gcc happy
+	  switch (rot) {
+	  case 0:
+	    for (k = 0;
+		 k < line1->len &&
+		   line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+		 ++k) ;
+	    break;
+	  case 1:
+	    for (k = 0;
+		 k < line1->len &&
+		   line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+		 ++k) ;
+	    break;
+	  case 2:
+	    for (k = 0;
+		 k < line1->len &&
+		   line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+		 ++k) ;
+	    break;
+	  case 3:
+	    for (k = 0;
+		 k < line1->len &&
+		   line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+		 ++k) ;
+	    break;
+	  }
+	  col2 = line1->col[k];
+	}
+	if (col2 > col1) {
+	  col1 = col2;
+	}
+      }
+      for (k = 0; k <= line0->len; ++k) {
+	line0->col[k] += col1;
+      }
+      if (line0->col[line0->len] > nColumns) {
+	nColumns = line0->col[line0->len];
+      }
     }
   }
   gfree(lineArray);
@@ -2111,6 +2142,8 @@ void TextPage::clear() {
     gfree(blocks);
   }
   deleteGooList(fonts, TextFontInfo);
+  deleteGooList(underlines, TextUnderline);
+  deleteGooList(links, TextLink);
 
   curWord = NULL;
   charPos = 0;
@@ -2128,6 +2161,8 @@ void TextPage::clear() {
   rawWords = NULL;
   rawLastWord = NULL;
   fonts = new GooList();
+  underlines = new GooList();
+  links = new GooList();
 }
 
 void TextPage::updateFont(GfxState *state) {
@@ -2426,7 +2461,7 @@ void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link)
   links->append(new TextLink(xMin, yMin, xMax, yMax, link));
 }
 
-void TextPage::coalesce(GBool physLayout, GBool doHTML) {
+void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) {
   UnicodeMap *uMap;
   TextPool *pool;
   TextWord *word0, *word1, *word2;
@@ -2454,7 +2489,7 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
   blkList = NULL;
   lastBlk = NULL;
   nBlocks = 0;
-  primaryRot = -1;
+  primaryRot = 0;
 
 #if 0 // for debugging
   printf("*** initial words ***\n");
@@ -2918,7 +2953,7 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
       //~ addition to primary rotation
 
       // coalesce the block, and add it to the list
-      blk->coalesce(uMap);
+      blk->coalesce(uMap, fixedPitch);
       if (lastBlk) {
 	lastBlk->next = blk;
       } else {
@@ -2926,11 +2961,12 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
       }
       lastBlk = blk;
       count[rot] += blk->charCount;
-      if (primaryRot < 0 || count[rot] > count[primaryRot]) {
-	primaryRot = rot;
-      }
       ++nBlocks;
     }
+
+    if (count[rot] > count[primaryRot]) {
+      primaryRot = rot;
+    }
   }
 
 #if 0 // for debugging
@@ -2992,75 +3028,108 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
   // sort blocks into xy order for column assignment
   if (blocks)
     gfree (blocks);
-  blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
-  for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
-    blocks[i] = blk;
-  }
-  qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
+  if (physLayout && fixedPitch) {
 
-  // column assignment
-  for (i = 0; i < nBlocks; ++i) {
-    blk0 = blocks[i];
-    col1 = 0;
-    for (j = 0; j < i; ++j) {
-      blk1 = blocks[j];
-      col2 = 0; // make gcc happy
+    blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
+    for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
+      blocks[i] = blk;
+      col1 = 0; // make gcc happy
       switch (primaryRot) {
       case 0:
-	if (blk0->xMin > blk1->xMax) {
-	  col2 = blk1->col + blk1->nColumns + 3;
-	} else if (blk1->xMax == blk1->xMin) {
-	  col2 = blk1->col;
-	} else {
-	  col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) /
-				    (blk1->xMax - blk1->xMin)) *
-				   blk1->nColumns);
-	}
+	col1 = (int)(blk->xMin / fixedPitch + 0.5);
 	break;
       case 1:
-	if (blk0->yMin > blk1->yMax) {
-	  col2 = blk1->col + blk1->nColumns + 3;
-	} else if (blk1->yMax == blk1->yMin) {
-	  col2 = blk1->col;
-	} else {
-	  col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) /
-				    (blk1->yMax - blk1->yMin)) *
-				   blk1->nColumns);
-	}
+	col1 = (int)(blk->yMin / fixedPitch + 0.5);
 	break;
       case 2:
-	if (blk0->xMax < blk1->xMin) {
-	  col2 = blk1->col + blk1->nColumns + 3;
-	} else if (blk1->xMin == blk1->xMax) {
-	  col2 = blk1->col;
-	} else {
-	  col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) /
-				    (blk1->xMin - blk1->xMax)) *
-				   blk1->nColumns);
-	}
+	col1 = (int)((pageWidth - blk->xMax) / fixedPitch + 0.5);
 	break;
       case 3:
-	if (blk0->yMax < blk1->yMin) {
-	  col2 = blk1->col + blk1->nColumns + 3;
-	} else if (blk1->yMin == blk1->yMax) {
-	  col2 = blk1->col;
-	} else {
-	  col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) /
-				    (blk1->yMin - blk1->yMax)) *
-				   blk1->nColumns);
-	}
+	col1 = (int)((pageHeight - blk->yMax) / fixedPitch + 0.5);
 	break;
       }
-      if (col2 > col1) {
-	col1 = col2;
+      blk->col = col1;
+      for (line = blk->lines; line; line = line->next) {
+	for (j = 0; j <= line->len; ++j) {
+	  line->col[j] += col1;
+	}
       }
     }
-    blk0->col = col1;
-    for (line = blk0->lines; line; line = line->next) {
-      for (j = 0; j <= line->len; ++j) {
-	line->col[j] += col1;
+
+  } else {
+
+    // sort blocks into xy order for column assignment
+    blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
+    for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
+      blocks[i] = blk;
+    }
+    qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
+
+    // column assignment
+    for (i = 0; i < nBlocks; ++i) {
+      blk0 = blocks[i];
+      col1 = 0;
+      for (j = 0; j < i; ++j) {
+	blk1 = blocks[j];
+	col2 = 0; // make gcc happy
+	switch (primaryRot) {
+	case 0:
+	  if (blk0->xMin > blk1->xMax) {
+	    col2 = blk1->col + blk1->nColumns + 3;
+	  } else if (blk1->xMax == blk1->xMin) {
+	    col2 = blk1->col;
+	  } else {
+	    col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) /
+				      (blk1->xMax - blk1->xMin)) *
+				     blk1->nColumns);
+	  }
+	  break;
+	case 1:
+	  if (blk0->yMin > blk1->yMax) {
+	    col2 = blk1->col + blk1->nColumns + 3;
+	  } else if (blk1->yMax == blk1->yMin) {
+	    col2 = blk1->col;
+	  } else {
+	    col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) /
+				      (blk1->yMax - blk1->yMin)) *
+				     blk1->nColumns);
+	  }
+	  break;
+	case 2:
+	  if (blk0->xMax < blk1->xMin) {
+	    col2 = blk1->col + blk1->nColumns + 3;
+	  } else if (blk1->xMin == blk1->xMax) {
+	    col2 = blk1->col;
+	  } else {
+	    col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) /
+				      (blk1->xMin - blk1->xMax)) *
+				     blk1->nColumns);
+	  }
+	  break;
+	case 3:
+	  if (blk0->yMax < blk1->yMin) {
+	    col2 = blk1->col + blk1->nColumns + 3;
+	  } else if (blk1->yMin == blk1->yMax) {
+	    col2 = blk1->col;
+	  } else {
+	    col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) /
+				      (blk1->yMin - blk1->yMax)) *
+				     blk1->nColumns);
+	  }
+	  break;
+	}
+	if (col2 > col1) {
+	  col1 = col2;
+	}
+      }
+      blk0->col = col1;
+      for (line = blk0->lines; line; line = line->next) {
+	for (j = 0; j <= line->len; ++j) {
+	  line->col[j] += col1;
+	}
       }
     }
+
   }
 
 #if 0 // for debugging
@@ -3070,7 +3139,7 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
 	   blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
 	   blk->nColumns);
     for (line = blk->lines; line; line = line->next) {
-      printf("  line:\n");
+      printf("  line: col[0]=%d\n", line->col[0]);
       for (word0 = line->words; word0; word0 = word0->next) {
 	printf("    word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
 	       word0->xMin, word0->xMax, word0->yMin, word0->yMax,
@@ -3470,6 +3539,7 @@ GBool TextPage::findText(Unicode *s, int len,
 			 GBool startAtTop, GBool stopAtBottom,
 			 GBool startAtLast, GBool stopAtLast,
 			 GBool caseSensitive, GBool backward,
+			 GBool wholeWord,
 			 double *xMin, double *yMin,
 			 double *xMax, double *yMax) {
   TextBlock *blk;
@@ -3527,25 +3597,35 @@ GBool TextPage::findText(Unicode *s, int len,
     blk = blocks[i];
 
     // check: is the block above the top limit?
-    if (!startAtTop && (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
+    // (this only works if the page's primary rotation is zero --
+    // otherwise the blocks won't be sorted in the useful order)
+    if (!startAtTop && primaryRot == 0 &&
+	(backward ? blk->yMin > yStart : blk->yMax < yStart)) {
       continue;
     }
 
     // check: is the block below the bottom limit?
-    if (!stopAtBottom && (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
+    // (this only works if the page's primary rotation is zero --
+    // otherwise the blocks won't be sorted in the useful order)
+    if (!stopAtBottom && primaryRot == 0 &&
+	(backward ? blk->yMax < yStop : blk->yMin > yStop)) {
       break;
     }
 
     for (line = blk->lines; line; line = line->next) {
 
       // check: is the line above the top limit?
-      if (!startAtTop &&
+      // (this only works if the page's primary rotation is zero --
+      // otherwise the lines won't be sorted in the useful order)
+      if (!startAtTop && primaryRot == 0 &&
 	  (backward ? line->yMin > yStart : line->yMin < yStart)) {
 	continue;
       }
 
       // check: is the line below the bottom limit?
-      if (!stopAtBottom &&
+      // (this only works if the page's primary rotation is zero --
+      // otherwise the lines won't be sorted in the useful order)
+      if (!stopAtBottom && primaryRot == 0 &&
 	  (backward ? line->yMin < yStop : line->yMin > yStop)) {
 	continue;
       }
@@ -3564,9 +3644,9 @@ GBool TextPage::findText(Unicode *s, int len,
 	for (k = 0; k < m; ++k) {
 	  txt[k] = unicodeToUpper(line->normalized[k]);
 	  }
-	  } else {
+      } else {
 	txt = line->normalized;
-	  }
+      }
 
       // search each position in this line
       j = backward ? m - len : 0;
@@ -5211,9 +5291,11 @@ static void TextOutputDev_outputToFile(void *stream, const char *text, int len)
 }
 
 TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
-			     GBool rawOrderA, GBool append) {
+			     double fixedPitchA, GBool rawOrderA,
+			     GBool append) {
   text = NULL;
   physLayout = physLayoutA;
+  fixedPitch = physLayout ? fixedPitchA : 0;
   rawOrder = rawOrderA;
   doHTML = gFalse;
   ok = gTrue;
@@ -5246,11 +5328,13 @@ TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
 }
 
 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
-			     GBool physLayoutA, GBool rawOrderA) {
+			     GBool physLayoutA, double fixedPitchA,
+			     GBool rawOrderA) {
   outputFunc = func;
   outputStream = stream;
   needClose = gFalse;
   physLayout = physLayoutA;
+  fixedPitch = physLayout ? fixedPitchA : 0;
   rawOrder = rawOrderA;
   doHTML = gFalse;
   text = new TextPage(rawOrderA);
@@ -5277,12 +5361,16 @@ void TextOutputDev::startPage(int pageNum, GfxState *state) {
 
 void TextOutputDev::endPage() {
   text->endPage();
-  text->coalesce(physLayout, doHTML);
+  text->coalesce(physLayout, fixedPitch, doHTML);
   if (outputStream) {
     text->dump(outputStream, outputFunc, physLayout);
   }
 }
 
+void TextOutputDev::restoreState(GfxState *state) {
+  text->updateFont(state);
+}
+
 void TextOutputDev::updateFont(GfxState *state) {
   text->updateFont(state);
 }
@@ -5465,10 +5553,12 @@ GBool TextOutputDev::findText(Unicode *s, int len,
 			      GBool startAtTop, GBool stopAtBottom,
 			      GBool startAtLast, GBool stopAtLast,
 			      GBool caseSensitive, GBool backward,
+			      GBool wholeWord,
 			      double *xMin, double *yMin,
 			      double *xMax, double *yMax) {
   return text->findText(s, len, startAtTop, stopAtBottom,
-			startAtLast, stopAtLast, caseSensitive, backward,
+			startAtLast, stopAtLast,
+			caseSensitive, backward, wholeWord,
 			xMin, yMin, xMax, yMax);
 }
 
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index fd34c8b..e31876b 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -333,7 +333,7 @@ public:
 
   void addWord(TextWord *word);
 
-  void coalesce(UnicodeMap *uMap);
+  void coalesce(UnicodeMap *uMap, double fixedPitch);
 
   // Update this block's priMin and priMax values, looking at <blk>.
   void updatePriMinMax(TextBlock *blk);
@@ -521,7 +521,7 @@ public:
   void addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link);
 
   // Coalesce strings that look like parts of the same line.
-  void coalesce(GBool physLayout, GBool doHTML);
+  void coalesce(GBool physLayout, double fixedPitch, GBool doHTML);
 
   // Find a string.  If <startAtTop> is true, starts looking at the
   // top of the page; else if <startAtLast> is true, starts looking
@@ -534,6 +534,7 @@ public:
 		 GBool startAtTop, GBool stopAtBottom,
 		 GBool startAtLast, GBool stopAtLast,
 		 GBool caseSensitive, GBool backward,
+		 GBool wholeWord,
 		 double *xMin, double *yMin,
 		 double *xMax, double *yMax);
 
@@ -676,14 +677,16 @@ public:
   // is maintained.  If <rawOrder> is true, the text is kept in
   // content stream order.
   TextOutputDev(char *fileName, GBool physLayoutA,
-		GBool rawOrderA, GBool append);
+		double fixedPitchA, GBool rawOrderA,
+		GBool append);
 
   // Create a TextOutputDev which will write to a generic stream.  If
   // <physLayoutA> is true, the original physical layout of the text
   // is maintained.  If <rawOrder> is true, the text is kept in
   // content stream order.
   TextOutputDev(TextOutputFunc func, void *stream,
-		GBool physLayoutA, GBool rawOrderA);
+		GBool physLayoutA, double fixedPitchA,
+		GBool rawOrderA);
 
   // Destructor.
   virtual ~TextOutputDev();
@@ -719,6 +722,9 @@ public:
   // End a page.
   virtual void endPage();
 
+  //----- save/restore graphics state
+  virtual void restoreState(GfxState *state);
+
   //----- update text state
   virtual void updateFont(GfxState *state);
 
@@ -754,6 +760,7 @@ public:
 		 GBool startAtTop, GBool stopAtBottom,
 		 GBool startAtLast, GBool stopAtLast,
 		 GBool caseSensitive, GBool backward,
+		 GBool wholeWord,
 		 double *xMin, double *yMin,
 		 double *xMax, double *yMax);
 
@@ -804,6 +811,9 @@ private:
   TextPage *text;		// text for the current page
   GBool physLayout;		// maintain original physical layout when
 				//   dumping text
+  double fixedPitch;		// if physLayout is true and this is non-zero,
+				//   assume fixed-pitch characters with this
+				//   width
   GBool rawOrder;		// keep text in content stream order
   GBool doHTML;			// extra processing for HTML conversion
   GBool ok;			// set up ok?
diff --git a/qt4/src/poppler-page.cc b/qt4/src/poppler-page.cc
index 9dc1d15..398a69b 100644
--- a/qt4/src/poppler-page.cc
+++ b/qt4/src/poppler-page.cc
@@ -330,7 +330,7 @@ QString Page::text(const QRectF &r, TextLayout textLayout) const
   QString result;
   
   const GBool rawOrder = textLayout == RawOrderLayout;
-  output_dev = new TextOutputDev(0, gFalse, rawOrder, gFalse);
+  output_dev = new TextOutputDev(0, gFalse, 0, rawOrder, gFalse);
   m_page->parentDoc->doc->displayPageSlice(output_dev, m_page->index + 1, 72, 72,
       0, false, true, false, -1, -1, -1, -1);
   if (r.isNull())
@@ -371,19 +371,19 @@ bool Page::search(const QString &text, double &sLeft, double &sTop, double &sRig
   int rotation = (int)rotate * 90;
 
   // fetch ourselves a textpage
-  TextOutputDev td(NULL, gTrue, gFalse, gFalse);
+  TextOutputDev td(NULL, gTrue, 0, gFalse, gFalse);
   m_page->parentDoc->doc->displayPage( &td, m_page->index + 1, 72, 72, rotation, false, true, false );
   TextPage *textPage=td.takeText();
 
   if (direction == FromTop)
     found = textPage->findText( u.data(), len, 
-            gTrue, gTrue, gFalse, gFalse, sCase, gFalse, &sLeft, &sTop, &sRight, &sBottom );
+            gTrue, gTrue, gFalse, gFalse, sCase, gFalse, gFalse, &sLeft, &sTop, &sRight, &sBottom );
   else if ( direction == NextResult )
     found = textPage->findText( u.data(), len, 
-            gFalse, gTrue, gTrue, gFalse, sCase, gFalse, &sLeft, &sTop, &sRight, &sBottom );
+            gFalse, gTrue, gTrue, gFalse, sCase, gFalse, gFalse, &sLeft, &sTop, &sRight, &sBottom );
   else if ( direction == PreviousResult )
     found = textPage->findText( u.data(), len, 
-            gFalse, gTrue, gTrue, gFalse, sCase, gTrue, &sLeft, &sTop, &sRight, &sBottom );
+            gFalse, gTrue, gTrue, gFalse, sCase, gTrue, gFalse, &sLeft, &sTop, &sRight, &sBottom );
 
   textPage->decRefCnt();
 
@@ -414,7 +414,7 @@ QList<TextBox*> Page::textList(Rotation rotate) const
   
   QList<TextBox*> output_list;
   
-  output_dev = new TextOutputDev(0, gFalse, gFalse, gFalse);
+  output_dev = new TextOutputDev(0, gFalse, 0, gFalse, gFalse);
   
   int rotation = (int)rotate * 90;
 
diff --git a/test/perf-test.cc b/test/perf-test.cc
index b6fb8f8..6bdda97 100644
--- a/test/perf-test.cc
+++ b/test/perf-test.cc
@@ -840,7 +840,7 @@ static void RenderPdfAsText(const char *fileName)
 
     LogInfo("started: %s\n", fileName);
 
-    TextOutputDev * textOut = new TextOutputDev(NULL, gTrue, gFalse, gFalse);
+    TextOutputDev * textOut = new TextOutputDev(NULL, gTrue, 0, gFalse, gFalse);
     if (!textOut->isOk()) {
         delete textOut;
         return;
diff --git a/utils/pdftotext.1 b/utils/pdftotext.1
index 587f1a9..88fbf70 100644
--- a/utils/pdftotext.1
+++ b/utils/pdftotext.1
@@ -53,6 +53,10 @@ Maintain (as best as possible) the original physical layout of the
 text.  The default is to \'undo' physical layout (columns,
 hyphenation, etc.) and output the text in reading order.
 .TP
+.BI \-fixed " number"
+Assume fixed-pitch (or tabular) text, with the specified character
+width (in points).  This forces physical layout mode.
+.TP
 .B \-raw
 Keep the text in content stream order.  This is a hack which often
 "undoes" column formatting, etc.  Use of raw mode is no longer
diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index 2e7b32e..a170f1b 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -68,6 +68,7 @@ static int w = 0;
 static int h = 0;
 static GBool bbox = gFalse;
 static GBool physLayout = gFalse;
+static double fixedPitch = 0;
 static GBool rawOrder = gFalse;
 static GBool htmlMeta = gFalse;
 static char textEncName[128] = "";
@@ -97,6 +98,8 @@ static const ArgDesc argDesc[] = {
    "height of crop area in pixels (default is 0)"},
   {"-layout",  argFlag,     &physLayout,    0,
    "maintain original physical layout"},
+  {"-fixed",   argFP,       &fixedPitch,    0,
+   "assume fixed-pitch (or tabular) text"},
   {"-raw",     argFlag,     &rawOrder,      0,
    "keep strings in content stream order"},
   {"-htmlmeta", argFlag,   &htmlMeta,       0,
@@ -197,6 +200,9 @@ int main(int argc, char *argv[]) {
   }
 
   fileName = new GooString(argv[1]);
+  if (fixedPitch) {
+    physLayout = gTrue;
+  }
 
   if (textEncName[0]) {
     globalParams->setTextEncoding(textEncName);
@@ -333,7 +339,7 @@ int main(int argc, char *argv[]) {
 
   // write text file
   if (bbox) {
-    textOut = new TextOutputDev(NULL, physLayout, rawOrder, htmlMeta);
+    textOut = new TextOutputDev(NULL, physLayout, fixedPitch, rawOrder, htmlMeta);
     if (!(f = fopen(textFileName->getCString(), "ab"))) {
       error(errIO, -1, "Couldn't open text file '{0:t}' for append", textFileName);
       exitCode = 2;
@@ -367,7 +373,7 @@ int main(int argc, char *argv[]) {
     fclose(f);
   } else {
     textOut = new TextOutputDev(textFileName->getCString(),
-				physLayout, rawOrder, htmlMeta);
+				physLayout, fixedPitch, rawOrder, htmlMeta);
     if (textOut->isOk()) {
       if ((w==0) && (h==0) && (x==0) && (y==0)) {
 	doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0,


More information about the poppler mailing list