[PATCH 1/2] Selecting text in raw order

Daniel Garcia danigm at yaco.es
Thu Sep 2 02:20:07 PDT 2010


---
 glib/poppler-page.cc     |    2 +-
 poppler/TextOutputDev.cc |  245 ++++++++++++++++++++++++++++++++++++++++++++--
 poppler/TextOutputDev.h  |    8 ++
 3 files changed, 245 insertions(+), 10 deletions(-)

diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc
index bc95e65..287726b 100644
--- a/glib/poppler-page.cc
+++ b/glib/poppler-page.cc
@@ -240,7 +240,7 @@ poppler_page_get_text_page (PopplerPage *page)
     TextOutputDev *text_dev;
     Gfx           *gfx;
 
-    text_dev = new TextOutputDev (NULL, gTrue, gFalse, gFalse);
+    text_dev = new TextOutputDev (NULL, gTrue, gTrue, gFalse);
     gfx = page->page->createGfx(text_dev,
 				72.0, 72.0, 0,
 				gFalse, /* useMediaBox */
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 576bcc9..c7107ad 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -416,6 +416,27 @@ inline int TextWord::primaryCmp(TextWord *word) {
   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
 }
 
+inline int TextWord::secondaryCmp(TextWord *word) {
+  double cmp;
+  // Returns -1 (<), 0 (=) or +1 (>) for <this> vs <word>, e.g. y ordering if rot=0.
+  cmp = 0; // make gcc happy; the result stays 0 if rot is not 0..3
+  switch (rot) {
+  case 0: // compare the yMin edges
+    cmp = yMin - word->yMin;
+    break;
+  case 1: // compare the xMin edges
+    cmp = xMin - word->xMin;
+    break;
+  case 2: // compare the yMax edges, operands swapped
+    cmp = word->yMax - yMax;
+    break;
+  case 3: // compare the xMax edges, operands swapped
+    cmp = word->xMax - xMax;
+    break;
+  }
+  return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; // exact comparison, no tolerance
+}
+
 double TextWord::primaryDelta(TextWord *word) {
   double delta;
 
@@ -2361,6 +2382,24 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
   if (rawOrder) {
     primaryRot = 0;
     primaryLR = gTrue;
+
+    // determine the primary direction
+    lrCount = 0;
+    TextWordList *wordlist = makeWordList(gFalse);
+    if (wordlist->getLength()) {
+      for (word0 = wordlist->get(0); word0; word0 = word0->next) {
+        for (i = 0; i < word0->len; ++i) {
+          if (unicodeTypeL(word0->text[i])) {
+            ++lrCount;
+          } else if (unicodeTypeR(word0->text[i])) {
+            --lrCount;
+          }
+        }
+      }
+      primaryLR = lrCount >= 0;
+    }
+    delete wordlist;
+
     return;
   }
 
@@ -4105,6 +4144,7 @@ public:
 			  PDFRectangle *selection);
   virtual void visitWord (TextWord *word, int begin, int end,
 			  PDFRectangle *selection);
+  void drawRegion (PDFRectangle *region);
 
 private:
   OutputDev *out;
@@ -4184,6 +4224,21 @@ void TextSelectionPainter::visitLine (TextLine *line,
   state->clearPath();
 }
 
+void TextSelectionPainter::drawRegion (PDFRectangle *region)  // fills <region> with box_color
+{
+  state->setFillColor(box_color);
+  out->updateFillColor(state);
+  // Rectangle path through (x1,y1), (x2,y1), (x2,y2), (x1,y2); <region> must not be NULL.
+  state->moveTo(region->x1, region->y1);
+  state->lineTo(region->x2, region->y1);
+  state->lineTo(region->x2, region->y2);
+  state->lineTo(region->x1, region->y2);
+  state->closePath();
+  // Fill through the output device, then clear the path held in <state>.
+  out->fill(state);
+  state->clearPath();
+}
+
 void TextSelectionPainter::visitWord (TextWord *word, int begin, int end,
 				      PDFRectangle *selection)
 {
@@ -4543,6 +4598,73 @@ void TextPage::visitSelection(TextSelectionVisitor *visitor,
   }
 }
 
+void TextPage::getSelectionWordLimits(PDFRectangle *selection,
+                                      SelectionStyle style,
+                                      double scale,
+                                      int *first,
+                                      int *last,
+                                      int *first_c,
+                                      int *last_c) {
+  // Maps the selection corners to word/character indexes: words
+  // [*first, *last] of makeWordList(gFalse) are selected, from character
+  // *first_c of word *first to character *last_c of word *last (inclusive).
+  // With no characters on the page the result is the empty range
+  // *first = 0, *last = -1.  <style> and <scale> are not used here.
+  TextWordList *wordlist = makeWordList(gFalse);
+  TextWord *word = NULL;
+  double distance, minor=-1, minor1=-1;
+  double xmin, ymin, xmax, ymax;
+  double x1 = selection->x1, y1 = selection->y1;
+  double x2 = selection->x2, y2 = selection->y2;
+  int tmp;
+
+  *first = *first_c = *last_c = 0;  // defined outputs even with no words
+  *last = -1;
+
+  for (int i=0; i<wordlist->getLength(); i++) {
+    word = wordlist->get(i);
+    for (int j=0; j<word->getLength(); j++) {
+      word->getCharBBox(j, &xmin, &ymin, &xmax, &ymax);
+      // start: Manhattan distance from (x1, y1) to the char's (xmin, ymin)
+      distance = fabs(x1 - xmin) + fabs(y1 - ymin);
+      if (minor < 0 || distance < minor) {
+        *first = i;
+        *first_c = j;
+        minor = distance;
+      }
+      // start: also try the char's (xmin, ymax) corner
+      distance = fabs(x1 - xmin) + fabs(y1 - ymax);
+      if (minor < 0 || distance < minor) {
+        *first = i;
+        *first_c = j;
+        minor = distance;
+      }
+      // end: distance from (x2, y2) to the char's (xmax, ymax)
+      distance = fabs(x2 - xmax) + fabs(y2 - ymax);
+      if (minor1 < 0 || distance < minor1) {
+        *last = i;
+        *last_c = j;
+        minor1 = distance;
+      }
+    }
+  }
+  // minor < 0 means no character was seen: keep the empty range as is.
+  if (minor >= 0 && *first > *last) {
+    tmp = *last;
+    *last = *first;
+    *first = tmp;
+    tmp = *last_c;
+    *last_c = *first_c;
+    *first_c = tmp;
+  }
+  if (*first == *last && *first_c > *last_c) {
+    tmp = *last_c;
+    *last_c = *first_c;
+    *first_c = tmp;
+  }
+  delete wordlist;
+}
+
 void TextPage::drawSelection(OutputDev *out,
 			     double scale,
 			     int rotation,
@@ -4550,30 +4672,135 @@ void TextPage::drawSelection(OutputDev *out,
 			     SelectionStyle style,
 			     GfxColor *glyph_color, GfxColor *box_color)
 {
-  TextSelectionPainter painter(this, scale, rotation, 
-			       out, box_color, glyph_color);
+  // Paints the selection: first the highlight boxes computed by
+  // getSelectionRegion(), then the selected part of each word through
+  // painter.visitWord().
+  TextSelectionPainter painter(this, scale, rotation,
+                               out, box_color, glyph_color);
+  // Start from an empty word range (last < first) so that nothing is
+  // indexed when the page has no words.
+  int first = 0, last = -1, first_c = 0, last_c = 0, begin, end;
+  TextWordList *wordlist = makeWordList(gFalse);
+  TextWord *word = NULL;
+  PDFRectangle *rect;
+  GooList *rlist;
+
+  if (wordlist->getLength() > 0) {
+    getSelectionWordLimits(selection, style, scale, &first, &last, &first_c, &last_c);
+    rlist = getSelectionRegion(selection, style, scale);
+    for(int i=0; i<rlist->getLength(); i++) {
+      rect = (PDFRectangle *)rlist->get(i);
+      painter.drawRegion(rect);
+      delete rect;  // getSelectionRegion() hands ownership to the caller
+    }
+    delete rlist;
+  }
+
+  for(int i=first; i<=last; i++) {
+    word = wordlist->get(i);
+    // Only the first and last word can be partial; first_c and last_c
+    // are inclusive character indexes, <end> is exclusive.
+    begin = (i == first) ? first_c : 0;
+    end = (i == last) ? last_c + 1 : word->getLength();
+
+    painter.visitWord(word, begin, end, selection);
+  }
 
-  visitSelection(&painter, selection, style);
+  delete wordlist;
 }
 
 GooList *TextPage::getSelectionRegion(PDFRectangle *selection,
 				      SelectionStyle style,
 				      double scale) {
-  TextSelectionSizer sizer(this, scale);
+  // Builds the highlight rectangles of the selection.  A rectangle is
+  // started for a word and stretched over the following words for which
+  // secondaryCmp() against that word is 0; the two ends are clipped to
+  // characters first_c and last_c.  The caller owns the list and its items.
+  // NOTE(review): <scale> is not applied to the rectangles -- confirm that
+  // callers expect unscaled coordinates.
+  GooList *ret = new GooList();
+  PDFRectangle *rect = NULL;
+  TextWordList *wordlist = makeWordList(gFalse);
+  TextWord *word=NULL, *prevword=NULL;
+  // Empty range (last < first) by default, so a page without words gives
+  // an empty list instead of reading word 0 of an empty word list.
+  int first=0, last=-1, first_c=0, last_c=0;
+  double xmin, ymin, xmax, ymax;
+  double xmin1, ymin1, xmax1, ymax1;
+
+  if (wordlist->getLength() > 0)
+    getSelectionWordLimits(selection, style, scale, &first, &last, &first_c, &last_c);
+
+  for (int i=first; i<=last; i++) {
+    word = wordlist->get(i);
+    if (prevword && !word->secondaryCmp(prevword) && rect) {
+      // Same line as the current rectangle: only move its x2 edge.
+      if (i == last) {
+        word->getCharBBox(last_c, &xmin, &ymin, &xmax, &ymax);
+      }
+      else {
+        word->getBBox(&xmin, &ymin, &xmax, &ymax);
+      }
+      rect->x2 = xmax;
+      continue;
+    }
+    // New rectangle: the word's box, with the start corner taken from
+    // character first_c and/or the end corner from character last_c.
+    word->getBBox(&xmin, &ymin, &xmax, &ymax);
+    if (i == first) {
+      word->getCharBBox(first_c, &xmin, &ymin, &xmax1, &ymax1);
+    }
+    if (i == last) {
+      word->getCharBBox(last_c, &xmin1, &ymin1, &xmax, &ymax);
+    }
+    rect = new PDFRectangle(xmin, ymin, xmax, ymax);
+    ret->append(rect);
+    prevword = word;
+  }
 
-  visitSelection(&sizer, selection, style);
+  delete wordlist;
 
-  return sizer.getRegion();
+  return ret;
 }
 
 GooString *TextPage::getSelectionText(PDFRectangle *selection,
 				      SelectionStyle style)
 {
-  TextSelectionDumper dumper(this);
+  GooString *ret = new GooString();  // selected text; stays empty on failure
+  TextWordList *wordlist;
+  TextWord *word=NULL, *prevword=NULL;
+  int first=0, last=-1, first_c=0, last_c=0;  // empty range by default
+  UnicodeMap *uMap;
+  // get the output encoding; bail out before the word list is allocated
+  if (!(uMap = globalParams->getTextEncoding()))
+    return ret;
+  wordlist = makeWordList(gFalse);
+  if (wordlist->getLength() > 0)  // no words: keep the empty range
+    getSelectionWordLimits(selection, style, 1, &first, &last, &first_c, &last_c);
 
-  visitSelection(&dumper, selection, style);
+  // Append the selected words to <ret>.  first_c and last_c are inclusive
+  // character indexes (drawSelection() paints up to last_c + 1), so the
+  // character at last_c must be part of the text as well.
+  // The extra bound on i keeps a page without words from reading word 0.
+  for (int i=first; i<=last && i<wordlist->getLength(); i++) {
+    word = wordlist->get(i);
+    if (prevword) {
+      // line break when the word does not compare equal to the previous
+      // one on the secondary axis, a single space otherwise
+      ret->append(word->secondaryCmp(prevword) ? '\n' : ' ');
+    }
+    // only the first and last word can be partial
+    int begin = (i == first) ? first_c : 0;
+    int end = (i == last) ? last_c + 1 : word->len;
+    dumpFragment(word->text + begin, end - begin, uMap, ret);
+    prevword = word;
+  }
+
+  // getTextEncoding() hands out a reference-counted map; drop our reference.
+  uMap->decRefCnt();
+  delete wordlist;
 
-  return dumper.getText();
+  return ret;
 }
 
 GBool TextPage::findCharRange(int pos, int length,
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index 438aee4..27d0d6a 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -125,6 +125,7 @@ public:
   // Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>),
   // based on a primary-axis comparison, e.g., x ordering if rot=0.
   int primaryCmp(TextWord *word);
+  int secondaryCmp(TextWord *word);
 
   // Return the distance along the primary axis between <this> and
   // <word>.
@@ -581,6 +582,13 @@ private:
   void clear();
   void assignColumns(TextLineFrag *frags, int nFrags, GBool rot);
   int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GooString *s);
+  void getSelectionWordLimits(PDFRectangle *selection,
+                              SelectionStyle style,
+                              double scale,
+                              int *first,
+                              int *last,
+                              int *first_c,
+                              int *last_c);
 
   GBool rawOrder;		// keep text in content stream order
 
-- 
1.7.2.2.169.gb5442




More information about the poppler mailing list