[PATCH] Select text in raworder.

Daniel Garcia danigm at yaco.es
Wed Sep 15 03:56:50 PDT 2010


---
 poppler/TextOutputDev.cc |  388 ++++++++++++++++++++++++++++++++++++++++++++--
 poppler/TextOutputDev.h  |   20 +++
 2 files changed, 397 insertions(+), 11 deletions(-)

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 576bcc9..179a15c 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -416,6 +416,27 @@ inline int TextWord::primaryCmp(TextWord *word) {
   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
 }
 
+inline int TextWord::secondaryCmp(TextWord *word) { // returns -1, 0 or +1 for <this> vs <word>, using the coordinate picked by rot below
+  double cmp;
+
+  cmp = 0; // make gcc happy; also the result (0) if rot is not 0..3
+  switch (rot) {
+  case 0: // rot 0: compare yMin
+    cmp = yMin - word->yMin;
+    break;
+  case 1: // rot 1: compare xMin
+    cmp = xMin - word->xMin;
+    break;
+  case 2: // rot 2: compare yMax, operands reversed
+    cmp = word->yMax - yMax;
+    break;
+  case 3: // rot 3: compare xMax, operands reversed
+    cmp = word->xMax - xMax;
+    break;
+  }
+  return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; // exact comparison: 0 only when the two coordinates are equal
+}
+
 double TextWord::primaryDelta(TextWord *word) {
   double delta;
 
@@ -1860,15 +1881,28 @@ TextWordList::TextWordList(TextPage *text, GBool physLayout) {
   TextFlow *flow;
   TextBlock *blk;
   TextLine *line;
-  TextWord *word;
+  TextWord *word, *prevword=NULL;
   TextWord **wordArray;
   int nWords, i;
 
   words = new GooList();
 
   if (text->rawOrder) {
-    for (word = text->rawWords; word; word = word->next) {
-      words->append(word);
+    if (text->primaryLR) {
+      for (word = text->rawWords; word; word = word->next) {
+        words->append(word);
+      }
+    } else {
+      i = 0;
+      for (word = text->rawWords; word; word = word->next) {
+        if (prevword) {
+          if (word->secondaryCmp(prevword)) {
+            i = getLength();
+          }
+        }
+        words->insert(i, word);
+        prevword = word;
+      }
     }
 
   } else if (physLayout) {
@@ -2361,6 +2395,24 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
   if (rawOrder) {
     primaryRot = 0;
     primaryLR = gTrue;
+
+    // determine the primary direction
+    lrCount = 0;
+    TextWordList *wordlist = makeWordList(gFalse);
+    if (wordlist->getLength()) {
+      for (word0 = wordlist->get(0); word0; word0 = word0->next) {
+        for (i = 0; i < word0->len; ++i) {
+          if (unicodeTypeL(word0->text[i])) {
+            ++lrCount;
+          } else if (unicodeTypeR(word0->text[i])) {
+            --lrCount;
+          }
+        }
+      }
+      primaryLR = lrCount >= 0;
+    }
+    delete wordlist;
+
     return;
   }
 
@@ -4105,6 +4157,7 @@ public:
 			  PDFRectangle *selection);
   virtual void visitWord (TextWord *word, int begin, int end,
 			  PDFRectangle *selection);
+  void drawRegion (PDFRectangle *region);
 
 private:
   OutputDev *out;
@@ -4184,6 +4237,21 @@ void TextSelectionPainter::visitLine (TextLine *line,
   state->clearPath();
 }
 
+void TextSelectionPainter::drawRegion (PDFRectangle *region) // fills the rectangle <region> on <out> using box_color
+{
+  state->setFillColor(box_color);
+  out->updateFillColor(state); // tell the output device about the new fill colour
+
+  state->moveTo(region->x1, region->y1); // trace the four corners of <region> as one closed path
+  state->lineTo(region->x2, region->y1);
+  state->lineTo(region->x2, region->y2);
+  state->lineTo(region->x1, region->y2);
+  state->closePath();
+
+  out->fill(state);
+  state->clearPath(); // drop the path again so it is not left behind in state
+}
+
 void TextSelectionPainter::visitWord (TextWord *word, int begin, int end,
 				      PDFRectangle *selection)
 {
@@ -4543,6 +4611,105 @@ void TextPage::visitSelection(TextSelectionVisitor *visitor,
   }
 }
 
+void TextPage::getSelectionWordLimits(PDFRectangle *selection,
+                                      SelectionStyle style,
+                                      double scale,
+                                      int *first,
+                                      int *last,
+                                      int *first_c,
+                                      int *last_c) {
+  // Find the words/characters of the raw-order word list that bound
+  // <selection>.  Normally *first <= *last index makeWordList() and
+  // *first_c / *last_c index characters inside those two words.  When no
+  // character is found, *first = 0 and *last = -1; <scale> is unused.
+  TextWordList *wordlist = makeWordList(gFalse);
+  TextWord *word=NULL;
+  double distance, minor=-1, minor1=-1;
+  double xmin, ymin, xmax, ymax;
+  double x1 = selection->x1, y1 = selection->y1;
+  double x2 = selection->x2, y2 = selection->y2;
+  int tmp;
+
+  // callers pass uninitialized ints, so always give them a defined range
+  *first = *first_c = 0;
+  *last = *last_c = -1;
+
+  for (int i=0; i<wordlist->getLength(); i++) {
+    word = wordlist->get(i);
+    for (int j=0; j<word->getLength(); j++) {
+      word->getCharBBox(j, &xmin, &ymin, &xmax, &ymax);
+      // start point: nearer of the (xmin,ymin) and (xmin,ymax) corners,
+      // with the y distance weighted 10x
+      distance = fabs(y1 - ymin);
+      if (fabs(y1 - ymax) < distance) {
+        distance = fabs(y1 - ymax);
+      }
+      distance = fabs(x1 - xmin) + 10*distance;
+      if (minor < 0 || distance < minor) {
+        *first = i;
+        *first_c = j;
+        minor = distance;
+      }
+
+      // end point: the (xmax,ymax) corner, same weighting
+      distance = fabs(x2 - xmax) + 10*fabs(y2 - ymax);
+      if (minor1 < 0 || distance < minor1) {
+        *last = i;
+        *last_c = j;
+        minor1 = distance;
+      }
+    }
+  }
+
+  if (minor < 0 || minor1 < 0) {
+    // nothing to select: keep the empty range set above
+    delete wordlist;
+    return;
+  }
+
+  // order the ends first so a backwards drag is widened correctly below
+  if (*first > *last) {
+    tmp = *last; *last = *first; *first = tmp;
+    tmp = *last_c; *last_c = *first_c; *first_c = tmp;
+  }
+
+  switch (style) {
+    case selectionStyleGlyph:
+      break;
+    case selectionStyleLine:
+      for (int i=*first; i>=0; i--) {
+        word = wordlist->get(i);
+        if (!word->secondaryCmp(wordlist->get(*first))) {
+          *first = i;
+        }
+      }
+      for (int i=*last; i<wordlist->getLength(); i++) {
+        word = wordlist->get(i);
+        if (!word->secondaryCmp(wordlist->get(*last))) {
+          *last = i;
+        }
+      }
+      // fall through: a line selection also takes whole words
+    case selectionStyleWord:
+      if (primaryLR || *first == *last) {
+        *first_c = 0;
+        *last_c = wordlist->get(*last)->getLength() - 1;
+      } else {
+        // right-to-left: callers take [0, first_c] and [last_c, len)
+        *first_c = wordlist->get(*first)->getLength() - 1;
+        *last_c = 0;
+      }
+      break;
+    default: break;
+  }
+
+  if (*first == *last && *first_c > *last_c) {
+    tmp = *last_c; *last_c = *first_c; *first_c = tmp;
+  }
+
+  delete wordlist;
+}
+
 void TextPage::drawSelection(OutputDev *out,
 			     double scale,
 			     int rotation,
@@ -4550,30 +4717,229 @@ void TextPage::drawSelection(OutputDev *out,
 			     SelectionStyle style,
 			     GfxColor *glyph_color, GfxColor *box_color)
 {
+  if (!rawOrder) {
+    TextSelectionPainter painter(this, scale, rotation,
+			         out, box_color, glyph_color);
+    visitSelection(&painter, selection, style);
+  } else {
+    drawSelectionRaw(out, scale, rotation, selection, style, glyph_color, box_color);
+  }
+}
+
+void TextPage::drawSelectionRaw(OutputDev *out,
+                                double scale,
+                                int rotation,
+                                PDFRectangle *selection,
+                                SelectionStyle style,
+                                GfxColor *glyph_color,
+                                GfxColor *box_color)
+{
+  // Raw-order counterpart of drawSelection(), called when rawOrder is set.
+  // Fills every rectangle of the selection region via painter.drawRegion(),
+  // then passes each selected word, together with the character range
+  // [begin, end) that lies inside the selection, to painter.visitWord().
   TextSelectionPainter painter(this, scale, rotation, 
-			       out, box_color, glyph_color);
+                               out, box_color, glyph_color);
+  // start from an empty range (first > last) so nothing is painted if the
+  // limits are ever left unset
+  int first = 0, last = -1, first_c = 0, last_c = -1, begin, end;
+  TextWordList *wordlist = makeWordList(gFalse);
+  TextWord *word = NULL;
+  PDFRectangle *rect;
+  GooList *rlist;
 
-  visitSelection(&painter, selection, style);
+  getSelectionWordLimits(selection, style, scale, &first, &last, &first_c, &last_c);
+
+  // paint the background boxes; for raw-order text the list and its
+  // rectangles come freshly allocated from getSelectionRegionRaw(), so
+  // release them here instead of leaking them
+  rlist = getSelectionRegion(selection, style, scale);
+  for(int i=0; i<rlist->getLength(); i++) {
+    rect = (PDFRectangle *)rlist->get(i);
+    painter.drawRegion(rect);
+    delete rect;
+  }
+  delete rlist;
+
+  // then hand every selected word fragment to the painter
+  for(int i=first; i<=last && i<wordlist->getLength(); i++) {
+    word = wordlist->get(i);
+
+    // [begin, end) is the slice of this word to paint: all of it unless
+    // the word is the first and/or last one of the range
+    if (i == first && i == last) {
+      begin = first_c;
+      end = last_c + 1;
+    } else if (primaryLR) {
+      begin = (i == first) ? first_c : 0;
+      end = (i == last) ? last_c + 1 : word->getLength();
+    } else {
+      // right-to-left: the first word keeps [0, first_c], the last one
+      // [last_c, len)
+      begin = (i == last) ? last_c : 0;
+      end = (i == first) ? first_c + 1 : word->getLength();
+    }
+
+    painter.visitWord(word, begin, end, selection);
+  }
+
+  // the word list was built by makeWordList() above for this call only
+  delete wordlist;
 }
 
 GooList *TextPage::getSelectionRegion(PDFRectangle *selection,
 				      SelectionStyle style,
 				      double scale) {
-  TextSelectionSizer sizer(this, scale);
+  if (!rawOrder) { // not raw order: collect the region by walking the selection with a sizer visitor
+    TextSelectionSizer sizer(this, scale);
+    visitSelection(&sizer, selection, style);
+    return sizer.getRegion();
+  } else { // raw order: use the word-list based implementation
+    return getSelectionRegionRaw(selection, style, scale);
+  }
+}
+
+GooList *TextPage::getSelectionRegionRaw(PDFRectangle *selection,
+                                         SelectionStyle style,
+                                         double scale)
+{
+  // Build the selection region for raw-order text: one new PDFRectangle per
+  // run of selected words that compare equal under secondaryCmp().
+  GooList *ret = new GooList();
+  PDFRectangle *rect = NULL;
+  TextWordList *wordlist = makeWordList(gFalse);
+  TextWord *word=NULL, *prevword=NULL;
+  int first=0, last=-1, first_c=0, last_c=-1; // empty range (first > last) unless the limits say otherwise
+  double xmin, ymin, xmax, ymax;
+  double xmin1, ymin1, xmax1, ymax1;
+
+  getSelectionWordLimits(selection, style, scale, &first, &last, &first_c, &last_c);
+  for (int i=first; i<=last && i<wordlist->getLength(); i++) { // bound check: never index past the word list
+    word = wordlist->get(i);
+    if (prevword && !word->secondaryCmp(prevword) && rect) { // same secondary coordinate: grow the current rectangle
+      if (i == last) {
+        word->getCharBBox(last_c, &xmin, &ymin, &xmax, &ymax);
+      } else {
+        word->getBBox(&xmin, &ymin, &xmax, &ymax);
+      }
 
-  visitSelection(&sizer, selection, style);
+      if (primaryLR) {
+        rect->x2 = xmax;
+      } else {
+        rect->x1 = xmin;
+      }
+      prevword = word;
+      continue;
+    }
 
-  return sizer.getRegion();
+    if (primaryLR) { // otherwise start a new rectangle; the end words are clipped to first_c / last_c
+      if (i == first && i == last) {
+        word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1);
+        word->getCharBBox(last_c, &xmin, &ymin, &xmax, &ymax);
+        xmin = xmin1; ymin = ymin1;
+      } else if (i == first) {
+        word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1);
+        word->getBBox(&xmin, &ymin, &xmax, &ymax);
+        xmin = xmin1; ymin = ymin1;
+      } else if (i == last) {
+        word->getCharBBox(last_c, &xmin1, &ymin1, &xmax1, &ymax1);
+        word->getBBox(&xmin, &ymin, &xmax, &ymax);
+        xmax = xmax1; ymax = ymax1;
+      } else {
+        word->getBBox(&xmin, &ymin, &xmax, &ymax);
+      }
+    } else { // right-to-left: the first word is clipped at its max side, the last at its min side
+      if (i == first && i == last) {
+        word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1);
+        word->getCharBBox(last_c, &xmin, &ymin, &xmax, &ymax);
+        xmin = xmin1; ymin = ymin1;
+      } else if (i == first) {
+        word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1);
+        word->getBBox(&xmin, &ymin, &xmax, &ymax);
+        xmax = xmax1; ymax = ymax1;
+      } else if (i == last) {
+        word->getCharBBox(last_c, &xmin1, &ymin1, &xmax1, &ymax1);
+        word->getBBox(&xmin, &ymin, &xmax, &ymax);
+        xmin = xmin1; ymin = ymin1;
+      } else {
+        word->getBBox(&xmin, &ymin, &xmax, &ymax);
+      }
+    }
+
+    rect = new PDFRectangle(xmin, ymin, xmax, ymax);
+    ret->append(rect);
+    prevword = word;
+  }
+
+  delete wordlist;
+
+  return ret; // the list and its rectangles were allocated with new above and are not freed here
 }
 
 GooString *TextPage::getSelectionText(PDFRectangle *selection,
 				      SelectionStyle style)
 {
-  TextSelectionDumper dumper(this);
+  if (!rawOrder) { // not raw order: collect the text by walking the selection with a dumper visitor
+    TextSelectionDumper dumper(this);
+    visitSelection(&dumper, selection, style);
+    return dumper.getText();
+  } else { // raw order: use the word-list based implementation
+    return getSelectionTextRaw(selection, style);
+  }
+}
+
+
+GooString *TextPage::getSelectionTextRaw(PDFRectangle *selection,
+                                         SelectionStyle style)
+{
+  // Return the selected text of a raw-order page as a new GooString in the
+  // output text encoding.  Consecutive words are joined with ' ', or with
+  // '\n' when secondaryCmp() reports a different secondary coordinate.
+  GooString *ret = new GooString();
+  UnicodeMap *uMap;
+  // get the output encoding first, so the early return below has no word
+  // list to leak
+  if (!(uMap = globalParams->getTextEncoding())) {
+    return ret;
+  }
+
+  TextWordList *wordlist = makeWordList(gFalse);
+  TextWord *word=NULL, *prevword=NULL;
+  // an empty range (first > last) in case the limits are left untouched
+  int first=0, last=-1, first_c=0, last_c=-1, begin, end;
+
+  getSelectionWordLimits(selection, style, 1, &first, &last, &first_c, &last_c);
+
+  for (int i=first; i<=last && i<wordlist->getLength(); i++) {
+    word = wordlist->get(i);
+    if (prevword) {
+      ret->append(word->secondaryCmp(prevword) ? '\n' : ' ');
+    }
+    // [begin, end) is the slice of this word to emit: all of it unless the
+    // word is the first and/or last one of the range
+    if (i == first && i == last) {
+      begin = first_c;
+      end = last_c + 1;
+    } else if (primaryLR) {
+      begin = (i == first) ? first_c : 0;
+      end = (i == last) ? last_c + 1 : word->len;
+    } else {
+      // right-to-left: the first word keeps [0, first_c], the last one
+      // [last_c, len) -- so its length is len - last_c, not len
+      begin = (i == last) ? last_c : 0;
+      end = (i == first) ? first_c + 1 : word->len;
+    }
+
+    dumpFragment(word->text + begin, end - begin, uMap, ret);
 
-  visitSelection(&dumper, selection, style);
+    prevword = word;
+  }
+
+  // getTextEncoding() handed us a counted reference; give it back
+  uMap->decRefCnt();
+  delete wordlist;
 
-  return dumper.getText();
+  return ret;
 }
 
 GBool TextPage::findCharRange(int pos, int length,
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index 438aee4..4b9edd4 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -125,6 +125,7 @@ public:
   // Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>),
   // based on a primary-axis comparison, e.g., x ordering if rot=0.
   int primaryCmp(TextWord *word);
+  int secondaryCmp(TextWord *word);
 
   // Return the distance along the primary axis between <this> and
   // <word>.
@@ -581,6 +582,25 @@ private:
   void clear();
   void assignColumns(TextLineFrag *frags, int nFrags, GBool rot);
   int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GooString *s);
+  void getSelectionWordLimits(PDFRectangle *selection,
+                              SelectionStyle style,
+                              double scale,
+                              int *first,
+                              int *last,
+                              int *first_c,
+                              int *last_c);
+  GooString *getSelectionTextRaw(PDFRectangle *selection,
+                                 SelectionStyle style);
+  GooList *getSelectionRegionRaw(PDFRectangle *selection,
+                                 SelectionStyle style,
+                                 double scale);
+  void drawSelectionRaw(OutputDev *out,
+                        double scale,
+                        int rotation,
+                        PDFRectangle *selection,
+                        SelectionStyle style,
+                        GfxColor *glyph_color,
+                        GfxColor *box_color);
 
   GBool rawOrder;		// keep text in content stream order
 
-- 
1.7.0.4


--=-6F/d4B6EosiMJ6QrM9ha--



More information about the poppler mailing list