[PATCH] Select text in raworder.
Daniel Garcia
danigm at yaco.es
Wed Sep 15 03:56:50 PDT 2010
---
poppler/TextOutputDev.cc | 388 ++++++++++++++++++++++++++++++++++++++++++++--
poppler/TextOutputDev.h | 20 +++
2 files changed, 397 insertions(+), 11 deletions(-)
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 576bcc9..179a15c 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -416,6 +416,27 @@ inline int TextWord::primaryCmp(TextWord *word) {
return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
}
+// Compares <this> to <word> using the coordinate selected by rot
+// (yMin, xMin, yMax or xMax for rot 0..3), returning -1 (<), 0 (=),
+// or +1 (>).
+inline int TextWord::secondaryCmp(TextWord *word) {
+  double delta = 0; // remains 0 when rot is not one of 0..3
+
+  if (rot == 0) {
+    delta = yMin - word->yMin;
+  } else if (rot == 1) {
+    delta = xMin - word->xMin;
+  } else if (rot == 2) {
+    delta = word->yMax - yMax;
+  } else if (rot == 3) {
+    delta = word->xMax - xMax;
+  }
+
+  if (delta < 0) {
+    return -1;
+  }
+  return delta > 0 ? 1 : 0;
+}
+
double TextWord::primaryDelta(TextWord *word) {
double delta;
@@ -1860,15 +1881,28 @@ TextWordList::TextWordList(TextPage *text, GBool physLayout) {
TextFlow *flow;
TextBlock *blk;
TextLine *line;
- TextWord *word;
+ TextWord *word, *prevword=NULL;
TextWord **wordArray;
int nWords, i;
words = new GooList();
if (text->rawOrder) {
- for (word = text->rawWords; word; word = word->next) {
- words->append(word);
+ if (text->primaryLR) {
+ for (word = text->rawWords; word; word = word->next) {
+ words->append(word);
+ }
+ } else {
+ i = 0;
+ for (word = text->rawWords; word; word = word->next) {
+ if (prevword) {
+ if (word->secondaryCmp(prevword)) {
+ i = getLength();
+ }
+ }
+ words->insert(i, word);
+ prevword = word;
+ }
}
} else if (physLayout) {
@@ -2361,6 +2395,24 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
if (rawOrder) {
primaryRot = 0;
primaryLR = gTrue;
+
+ // determine the primary direction
+ lrCount = 0;
+ TextWordList *wordlist = makeWordList(gFalse);
+ if (wordlist->getLength()) {
+ for (word0 = wordlist->get(0); word0; word0 = word0->next) {
+ for (i = 0; i < word0->len; ++i) {
+ if (unicodeTypeL(word0->text[i])) {
+ ++lrCount;
+ } else if (unicodeTypeR(word0->text[i])) {
+ --lrCount;
+ }
+ }
+ }
+ primaryLR = lrCount >= 0;
+ }
+ delete wordlist;
+
return;
}
@@ -4105,6 +4157,7 @@ public:
PDFRectangle *selection);
virtual void visitWord (TextWord *word, int begin, int end,
PDFRectangle *selection);
+ void drawRegion (PDFRectangle *region);
private:
OutputDev *out;
@@ -4184,6 +4237,21 @@ void TextSelectionPainter::visitLine (TextLine *line,
state->clearPath();
}
+// Fills the rectangle described by <region> with box_color on the
+// output device.
+void TextSelectionPainter::drawRegion (PDFRectangle *region)
+{
+  state->setFillColor(box_color);
+  out->updateFillColor(state);
+
+  // trace the four corners, starting from (x1, y1), then close the path
+  const PDFRectangle &box = *region;
+  state->moveTo(box.x1, box.y1);
+  state->lineTo(box.x2, box.y1);
+  state->lineTo(box.x2, box.y2);
+  state->lineTo(box.x1, box.y2);
+  state->closePath();
+
+  out->fill(state);
+  state->clearPath();
+}
+
void TextSelectionPainter::visitWord (TextWord *word, int begin, int end,
PDFRectangle *selection)
{
@@ -4543,6 +4611,105 @@ void TextPage::visitSelection(TextSelectionVisitor *visitor,
}
}
+// Find the words and characters of the word list (makeWordList(gFalse))
+// that delimit <selection>.
+//
+// (x1, y1) of the selection is matched against the corners of every
+// character box to pick *first / *first_c; (x2, y2) is matched the same
+// way to pick *last / *last_c.  For selectionStyleLine the word range is
+// widened to every word that compares equal under secondaryCmp(), and
+// for both selectionStyleLine and selectionStyleWord the character
+// limits are widened to whole words.
+//
+// On return *first <= *last.  If the word list contains no characters
+// the empty range *first = 0, *last = -1 is returned, so callers that
+// loop from *first to *last do nothing.  <scale> is unused.
+void TextPage::getSelectionWordLimits(PDFRectangle *selection,
+                                      SelectionStyle style,
+                                      double scale,
+                                      int *first,
+                                      int *last,
+                                      int *first_c,
+                                      int *last_c) {
+  TextWordList *wordlist = makeWordList(gFalse);
+  TextWord *word = NULL;
+  double distance, minor = -1, minor1 = -1;
+  double xmin, ymin, xmax, ymax;
+  double x1, y1, x2, y2;
+  int tmp;
+
+  // Always leave the outputs defined, even when nothing is found.
+  *first = 0;
+  *last = -1;
+  *first_c = 0;
+  *last_c = 0;
+
+  x1 = selection->x1;
+  x2 = selection->x2;
+
+  y1 = selection->y1;
+  y2 = selection->y2;
+
+  for (int i = 0; i < wordlist->getLength(); i++) {
+    word = wordlist->get(i);
+
+    for (int j = 0; j < word->getLength(); j++) {
+      word->getCharBBox(j, &xmin, &ymin, &xmax, &ymax);
+
+      // The vertical offset is weighted 10x so that a character on the
+      // right line wins over a horizontally closer one on another line.
+      distance = fabs(x1 - xmin) + 10*fabs(y1 - ymin);
+      if (minor < 0 || distance < minor) {
+        *first = i;
+        *first_c = j;
+        minor = distance;
+      }
+
+      distance = fabs(x1 - xmin) + 10*fabs(y1 - ymax);
+      if (minor < 0 || distance < minor) {
+        *first = i;
+        *first_c = j;
+        minor = distance;
+      }
+
+      distance = fabs(x2 - xmax) + 10*fabs(y2 - ymax);
+      if (minor1 < 0 || distance < minor1) {
+        *last = i;
+        *last_c = j;
+        minor1 = distance;
+      }
+    }
+  }
+
+  if (minor < 0) {
+    // No character was visited: keep the empty range set above instead
+    // of indexing the word list with unset limits.
+    delete wordlist;
+    return;
+  }
+
+  // Put the limits in ascending order before widening them; the
+  // widening below moves *first backwards and *last forwards and the
+  // whole-word limits depend on which word is the first one.
+  if (*first > *last) {
+    tmp = *last;
+    *last = *first;
+    *first = tmp;
+
+    tmp = *last_c;
+    *last_c = *first_c;
+    *first_c = tmp;
+  }
+
+  if (*first == *last && *first_c > *last_c) {
+    tmp = *last_c;
+    *last_c = *first_c;
+    *first_c = tmp;
+  }
+
+  switch (style) {
+  case selectionStyleGlyph:
+    break;
+  case selectionStyleLine:
+    for (int i = *first; i >= 0; i--) {
+      word = wordlist->get(i);
+      if (!word->secondaryCmp(wordlist->get(*first))) {
+        *first = i;
+      }
+    }
+    for (int i = *last; i < wordlist->getLength(); i++) {
+      word = wordlist->get(i);
+      if (!word->secondaryCmp(wordlist->get(*last))) {
+        *last = i;
+      }
+    }
+    // fall through: whole lines are also made of whole words
+  case selectionStyleWord:
+    // Compare the indices, not the out-pointers.  The callers read a
+    // single word as [first_c, last_c]; in left-to-right text the first
+    // word as [first_c, end) and the last as [0, last_c]; in
+    // right-to-left text the first word as [0, first_c] and the last as
+    // [last_c, end).
+    if (primaryLR || *first == *last) {
+      *first_c = 0;
+      *last_c = wordlist->get(*last)->getLength() - 1;
+    } else {
+      *first_c = wordlist->get(*first)->getLength() - 1;
+      *last_c = 0;
+    }
+    break;
+  default:
+    break;
+  }
+
+  delete wordlist;
+}
+
void TextPage::drawSelection(OutputDev *out,
double scale,
int rotation,
@@ -4550,30 +4717,229 @@ void TextPage::drawSelection(OutputDev *out,
SelectionStyle style,
GfxColor *glyph_color, GfxColor *box_color)
{
+ if (!rawOrder) {
+ TextSelectionPainter painter(this, scale, rotation,
+ out, box_color, glyph_color);
+ visitSelection(&painter, selection, style);
+ } else {
+ drawSelectionRaw(out, scale, rotation, selection, style, glyph_color, box_color);
+ }
+}
+
+// Paint the selection on a raw-order page: fill every rectangle of the
+// selection region through the painter, then visit each selected word
+// with the range of characters that falls inside the selection.
+void TextPage::drawSelectionRaw(OutputDev *out,
+                                double scale,
+                                int rotation,
+                                PDFRectangle *selection,
+                                SelectionStyle style,
+                                GfxColor *glyph_color,
+                                GfxColor *box_color)
+{
TextSelectionPainter painter(this, scale, rotation,
- out, box_color, glyph_color);
+ out, box_color, glyph_color);
+  // Start from an empty range (first > last) so that the word loop is
+  // skipped if getSelectionWordLimits() finds nothing to report.
+  int first = 0, last = -1, first_c = 0, last_c = 0;
+  int begin, end;
+  TextWordList *wordlist = makeWordList(gFalse);
+  TextWord *word = NULL;
+  PDFRectangle *rect;
+  GooList *rlist;
- visitSelection(&painter, selection, style);
+  getSelectionWordLimits(selection, style, scale, &first, &last, &first_c, &last_c);
+  rlist = getSelectionRegion(selection, style, scale);
+  for (int i = 0; i < rlist->getLength(); i++) {
+    rect = (PDFRectangle *)rlist->get(i);
+    painter.drawRegion(rect);
+    // the region list and its rectangles were allocated for this call
+    delete rect;
+  }
+  delete rlist;
+
+  for (int i = first; i <= last; i++) {
+    word = wordlist->get(i);
+    if (primaryLR) {
+      // left-to-right: the first word starts at first_c, the last word
+      // ends at last_c, everything in between is taken whole
+      begin = (i == first) ? first_c : 0;
+      end = (i == last) ? last_c + 1 : word->getLength();
+    } else {
+      if (i == first && i == last) {
+        begin = first_c;
+        end = last_c + 1;
+      } else if (i == first) {
+        begin = 0;
+        end = first_c + 1;
+      } else if (i == last) {
+        begin = last_c;
+        end = word->getLength();
+      } else {
+        begin = 0;
+        end = word->getLength();
+      }
+    }
+
+    painter.visitWord(word, begin, end, selection);
+  }
+
+  delete wordlist;
}
GooList *TextPage::getSelectionRegion(PDFRectangle *selection,
SelectionStyle style,
double scale) {
- TextSelectionSizer sizer(this, scale);
+ // Raw-order pages are handled by getSelectionRegionRaw(); every other
+ // page is measured with the TextSelectionSizer visitor.
+ if (!rawOrder) {
+ TextSelectionSizer sizer(this, scale);
+ visitSelection(&sizer, selection, style);
+ return sizer.getRegion();
+ } else {
+ return getSelectionRegionRaw(selection, style, scale);
+ }
+}
+
+// Build the list of rectangles covering the selection on a raw-order
+// page.  A word that compares equal to the previous one under
+// secondaryCmp() widens the current rectangle along x; any other word
+// starts a new rectangle.  Returns a newly allocated GooList holding
+// newly allocated PDFRectangle objects (empty if nothing is selected).
+GooList *TextPage::getSelectionRegionRaw(PDFRectangle *selection,
+                                         SelectionStyle style,
+                                         double scale)
+{
+  GooList *ret = new GooList();
+  PDFRectangle *rect = NULL;
+  TextWordList *wordlist = makeWordList(gFalse);
+  TextWord *word = NULL, *prevword = NULL;
+  // Empty range (first > last) by default: a 0..0 default would read
+  // word 0 of an empty word list when no limits are reported.
+  int first = 0, last = -1, first_c = 0, last_c = 0;
+  double xmin, ymin, xmax, ymax;
+  double xmin1, ymin1, xmax1, ymax1;
+
+  getSelectionWordLimits(selection, style, scale, &first, &last, &first_c, &last_c);
+
+  for (int i = first; i <= last; i++) {
+    word = wordlist->get(i);
+    if (prevword && !word->secondaryCmp(prevword) && rect) {
+      // compares equal to the previous word: stretch the current
+      // rectangle instead of starting a new one
+      if (i == last) {
+        word->getCharBBox(last_c, &xmin, &ymin, &xmax, &ymax);
+      } else {
+        word->getBBox(&xmin, &ymin, &xmax, &ymax);
+      }
- visitSelection(&sizer, selection, style);
+      if (primaryLR) {
+        rect->x2 = xmax;
+      } else {
+        rect->x1 = xmin;
+      }
+      prevword = word;
+      continue;
+    }
- return sizer.getRegion();
+    // Start a new rectangle.  The first and last words contribute only
+    // the part bounded by first_c / last_c; which edge is trimmed
+    // depends on the primary direction.
+    if (primaryLR) {
+      if (i == first && i == last) {
+        word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1);
+        word->getCharBBox(last_c, &xmin, &ymin, &xmax, &ymax);
+        xmin = xmin1; ymin = ymin1;
+      } else if (i == first) {
+        word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1);
+        word->getBBox(&xmin, &ymin, &xmax, &ymax);
+        xmin = xmin1; ymin = ymin1;
+      } else if (i == last) {
+        word->getCharBBox(last_c, &xmin1, &ymin1, &xmax1, &ymax1);
+        word->getBBox(&xmin, &ymin, &xmax, &ymax);
+        xmax = xmax1; ymax = ymax1;
+      } else {
+        word->getBBox(&xmin, &ymin, &xmax, &ymax);
+      }
+    } else {
+      if (i == first && i == last) {
+        word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1);
+        word->getCharBBox(last_c, &xmin, &ymin, &xmax, &ymax);
+        xmin = xmin1; ymin = ymin1;
+      } else if (i == first) {
+        word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1);
+        word->getBBox(&xmin, &ymin, &xmax, &ymax);
+        xmax = xmax1; ymax = ymax1;
+      } else if (i == last) {
+        word->getCharBBox(last_c, &xmin1, &ymin1, &xmax1, &ymax1);
+        word->getBBox(&xmin, &ymin, &xmax, &ymax);
+        xmin = xmin1; ymin = ymin1;
+      } else {
+        word->getBBox(&xmin, &ymin, &xmax, &ymax);
+      }
+    }
+
+    rect = new PDFRectangle(xmin, ymin, xmax, ymax);
+    ret->append(rect);
+    prevword = word;
+  }
+
+  delete wordlist;
+
+  return ret;
}
GooString *TextPage::getSelectionText(PDFRectangle *selection,
SelectionStyle style)
{
- TextSelectionDumper dumper(this);
+ // Raw-order pages are handled by getSelectionTextRaw(); every other
+ // page is dumped with the TextSelectionDumper visitor.
+ if (!rawOrder) {
+ TextSelectionDumper dumper(this);
+ visitSelection(&dumper, selection, style);
+ return dumper.getText();
+ } else {
+ return getSelectionTextRaw(selection, style);
+ }
+}
+
+
+// Return the text of the selection on a raw-order page as a newly
+// allocated GooString in the global text encoding.  Words are
+// separated by '\n' when secondaryCmp() reports a difference from the
+// previous word and by ' ' otherwise.  The string is empty if no text
+// encoding is available or nothing is selected.
+GooString *TextPage::getSelectionTextRaw(PDFRectangle *selection,
+                                         SelectionStyle style)
+{
+  GooString *ret = new GooString();
+  TextWordList *wordlist;
+  TextWord *word = NULL, *prevword = NULL;
+  // Empty range (first > last) by default: a 0..0 default would read
+  // word 0 of an empty word list when no limits are reported.
+  int first = 0, last = -1, first_c = 0, last_c = 0;
+  UnicodeMap *uMap;
+  // get the output encoding; done before building the word list so the
+  // early return has nothing to release
+  if (!(uMap = globalParams->getTextEncoding())) {
+    return ret;
+  }
+
+  wordlist = makeWordList(gFalse);
+  getSelectionWordLimits(selection, style, 1, &first, &last, &first_c, &last_c);
+
+  for (int i = first; i <= last; i++) {
+    word = wordlist->get(i);
+    if (prevword) {
+      if (word->secondaryCmp(prevword)) {
+        ret->append('\n');
+      } else {
+        ret->append(' ');
+      }
+    }
+    if (primaryLR) {
+      if (i == first && i == last) {
+        dumpFragment(word->text + first_c, last_c+1 - first_c, uMap, ret);
+      } else if (i == first) {
+        dumpFragment(word->text + first_c, word->len - first_c, uMap, ret);
+      } else if (i == last) {
+        dumpFragment(word->text, last_c+1, uMap, ret);
+      } else {
+        dumpFragment(word->text, word->len, uMap, ret);
+      }
+    } else {
+      if (i == first && i == last) {
+        dumpFragment(word->text + first_c, last_c+1 - first_c, uMap, ret);
+      } else if (i == first) {
+        dumpFragment(word->text, first_c+1, uMap, ret);
+      } else if (i == last) {
+        // the fragment starts at last_c, so only len - last_c characters
+        // remain; passing len would read past the end of the word
+        dumpFragment(word->text + last_c, word->len - last_c, uMap, ret);
+      } else {
+        dumpFragment(word->text, word->len, uMap, ret);
+      }
+    }
- visitSelection(&dumper, selection, style);
+    prevword = word;
+  }
+  delete wordlist;
+  // NOTE(review): getTextEncoding() is expected to hand out a counted
+  // reference that the caller releases -- confirm against GlobalParams.
+  uMap->decRefCnt();
- return dumper.getText();
+  return ret;
}
GBool TextPage::findCharRange(int pos, int length,
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index 438aee4..4b9edd4 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -125,6 +125,7 @@ public:
// Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>),
// based on a primary-axis comparison, e.g., x ordering if rot=0.
int primaryCmp(TextWord *word);
+ int secondaryCmp(TextWord *word);
// Return the distance along the primary axis between <this> and
// <word>.
@@ -581,6 +582,25 @@ private:
void clear();
void assignColumns(TextLineFrag *frags, int nFrags, GBool rot);
int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GooString *s);
+ void getSelectionWordLimits(PDFRectangle *selection,
+ SelectionStyle style,
+ double scale,
+ int *first,
+ int *last,
+ int *first_c,
+ int *last_c);
+ GooString *getSelectionTextRaw(PDFRectangle *selection,
+ SelectionStyle style);
+ GooList *getSelectionRegionRaw(PDFRectangle *selection,
+ SelectionStyle style,
+ double scale);
+ void drawSelectionRaw(OutputDev *out,
+ double scale,
+ int rotation,
+ PDFRectangle *selection,
+ SelectionStyle style,
+ GfxColor *glyph_color,
+ GfxColor *box_color);
GBool rawOrder; // keep text in content stream order
--
1.7.0.4
--=-6F/d4B6EosiMJ6QrM9ha--
More information about the poppler
mailing list