[poppler] 4 commits - glib/poppler-page.cc poppler/TextOutputDev.cc poppler/TextOutputDev.h
Carlos Garcia Campos
carlosgc at kemper.freedesktop.org
Tue Jun 25 02:48:00 PDT 2013
glib/poppler-page.cc | 168 ++++++++++++++++--------------
poppler/TextOutputDev.cc | 261 +++++++++++++++++++++++++----------------------
poppler/TextOutputDev.h | 4
3 files changed, 235 insertions(+), 198 deletions(-)
New commits:
commit c55b577ce69ad4bb69f5261b3e120e92c9fdb3d0
Author: Carlos Garcia Campos <carlosgc at gnome.org>
Date: Tue Jun 25 10:01:38 2013 +0200
glib: Use TextPage::getSelectionWords to build text layout and attributes
This way we can make sure that the list of words used in
poppler_page_get_text_layout and poppler_page_get_text_attributes is the
same as the one used in poppler_page_get_text. This fixes the mismatch
between the number of characters in the text returned by
poppler_page_get_text and the number of characters returned by
poppler_page_get_text_layout in some documents.
diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc
index 631edb5..9115b78 100644
--- a/glib/poppler-page.cc
+++ b/glib/poppler-page.cc
@@ -1979,66 +1979,66 @@ poppler_page_get_text_layout (PopplerPage *page,
guint *n_rectangles)
{
TextPage *text;
- TextWordList *wordlist;
- TextWord *word, *nextword;
PopplerRectangle *rect;
- int i, j;
+ PDFRectangle selection;
+ int i, j, k;
guint offset = 0;
guint n_rects = 0;
gdouble x1, y1, x2, y2;
gdouble x3, y3, x4, y4;
+ GooList **word_list;
+ int n_lines;
g_return_val_if_fail (POPPLER_IS_PAGE (page), FALSE);
*n_rectangles = 0;
+ poppler_page_get_size (page, &selection.x2, &selection.y2);
text = poppler_page_get_text_page (page);
- wordlist = text->makeWordList (gFalse);
+ word_list = text->getSelectionWords (&selection, selectionStyleGlyph, &n_lines);
+ if (!word_list)
+ return FALSE;
- if (wordlist->getLength () <= 0)
+ n_rects += n_lines - 1;
+ for (i = 0; i < n_lines; i++)
{
- delete wordlist;
- return FALSE;
- }
-
- // Getting the array size
- for (i = 0; i < wordlist->getLength (); i++)
- {
- word = wordlist->get (i);
- n_rects += word->getLength ();
- if (!word->getNext () || word->getSpaceAfter ())
- n_rects++;
+ GooList *line_words = word_list[i];
+ n_rects += line_words->getLength() - 1;
+ for (j = 0; j < line_words->getLength(); j++)
+ {
+ TextWord *word = (TextWord *)line_words->get(j);
+ n_rects += word->getLength();
+ }
}
- n_rects--;
- *n_rectangles = n_rects;
*rectangles = g_new (PopplerRectangle, n_rects);
+ *n_rectangles = n_rects;
- // Calculating each char position
- for (i = 0; i < wordlist->getLength (); i++)
+ for (i = 0; i < n_lines; i++)
{
- word = wordlist->get (i);
- for (j = 0; j < word->getLength (); j++)
+ GooList *line_words = word_list[i];
+ for (j = 0; j < line_words->getLength(); j++)
{
+ TextWord *word = (TextWord *)line_words->get(j);
+ for (k = 0; k < word->getLength(); k++)
+ {
+ rect = *rectangles + offset;
+ word->getCharBBox (k,
+ &(rect->x1),
+ &(rect->y1),
+ &(rect->x2),
+ &(rect->y2));
+ offset++;
+ }
+
rect = *rectangles + offset;
- word->getCharBBox (j,
- &(rect->x1),
- &(rect->y1),
- &(rect->x2),
- &(rect->y2));
- offset++;
- }
+ word->getBBox (&x1, &y1, &x2, &y2);
- // adding spaces and break lines
- rect = *rectangles + offset;
- word->getBBox (&x1, &y1, &x2, &y2);
+ if (j < line_words->getLength() - 1)
+ {
+ TextWord *next_word = (TextWord *)line_words->get(j + 1);
- nextword = word->getNext ();
- if (nextword)
- {
- if (word->getSpaceAfter ())
- {
- nextword->getBBox (&x3, &y3, &x4, &y4);
+ next_word->getBBox(&x3, &y3, &x4, &y4);
// space is from one word to other and with the same height as
// first word.
rect->x1 = x2;
@@ -2046,20 +2046,23 @@ poppler_page_get_text_layout (PopplerPage *page,
rect->x2 = x3;
rect->y2 = y2;
offset++;
- }
- }
- else if (offset < n_rects)
+ }
+ }
+
+ if (i < n_lines - 1 && offset > 0)
{
- // end of line
- rect->x1 = x2;
- rect->y1 = y2;
- rect->x2 = x2;
- rect->y2 = y2;
- offset++;
- }
+ // end of line
+ rect->x1 = x2;
+ rect->y1 = y2;
+ rect->x2 = x2;
+ rect->y2 = y2;
+ offset++;
+ }
+
+ delete line_words;
}
- delete wordlist;
+ gfree (word_list);
return TRUE;
}
@@ -2122,53 +2125,62 @@ GList *
poppler_page_get_text_attributes (PopplerPage *page)
{
TextPage *text;
- TextWordList *wordlist;
+ PDFRectangle selection;
+ GooList **word_list;
+ int n_lines;
PopplerTextAttributes *attrs = NULL;
- gint i, offset = 0;
+ TextWord *word, *prev_word = NULL;
+ gint word_i, prev_word_i;
+ gint i, j;
+ gint offset = 0;
GList *attributes = NULL;
g_return_val_if_fail (POPPLER_IS_PAGE (page), NULL);
+ poppler_page_get_size (page, &selection.x2, &selection.y2);
text = poppler_page_get_text_page (page);
- wordlist = text->makeWordList (gFalse);
+ word_list = text->getSelectionWords (&selection, selectionStyleGlyph, &n_lines);
+ if (!word_list)
+ return NULL;
- if (wordlist->getLength () <= 0)
+ for (i = 0; i < n_lines; i++)
{
- delete wordlist;
- return NULL;
- }
-
- TextWord *word, *prev_word = NULL;
- gint word_i, prev_word_i;
+ GooList *line_words = word_list[i];
+ for (j = 0; j < line_words->getLength(); j++)
+ {
+ word = (TextWord *)line_words->get(j);
- // Calculating each word attributes
- for (i = 0; i < wordlist->getLength (); i++)
- {
- word = wordlist->get (i);
+ for (word_i = 0; word_i < word->getLength (); word_i++)
+ {
+ if (!prev_word || !word_text_attributes_equal (word, word_i, prev_word, prev_word_i))
+ {
+ attrs = poppler_text_attributes_new_from_word (word, word_i);
+ attrs->start_index = offset;
+ attributes = g_list_prepend (attributes, attrs);
+ }
+ attrs->end_index = offset;
+ offset++;
+ prev_word = word;
+ prev_word_i = word_i;
+ }
- for (word_i = 0; word_i < word->getLength (); word_i++)
- {
- if (!prev_word || !word_text_attributes_equal (word, word_i, prev_word, prev_word_i))
+ if (j < line_words->getLength() - 1)
{
- attrs = poppler_text_attributes_new_from_word (word, word_i);
- attrs->start_index = offset;
- attributes = g_list_prepend (attributes, attrs);
+ attrs->end_index = offset;
+ offset++;
}
- attrs->end_index = offset;
- offset++;
- prev_word = word;
- prev_word_i = word_i;
- }
- if (!word->getNext () || word->getSpaceAfter ())
+ }
+
+ if (i < n_lines - 1)
{
attrs->end_index = offset;
offset++;
}
+
+ delete line_words;
}
- if (attrs)
- attrs->end_index--;
- delete wordlist;
+ gfree (word_list);
return g_list_reverse(attributes);
}
commit fc534f571315c064005515c19d7d70ad3af1563e
Author: Carlos Garcia Campos <carlosgc at gnome.org>
Date: Tue Jun 25 10:05:01 2013 +0200
TextOutputDev: add a method to TextPage to get the selection as a list of words
Returns a list of lines of words.
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 4ef5963..928e95a 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -4043,6 +4043,7 @@ public:
void endPage();
GooString *getText(void);
+ GooList **getWordList(int *nLines);
private:
@@ -4177,6 +4178,29 @@ GooString *TextSelectionDumper::getText (void)
return text;
}
+GooList **TextSelectionDumper::getWordList(int *nLinesOut)
+{
+ int i, j;
+
+ if (nLines == 0)
+ return NULL;
+
+ GooList **wordList = (GooList **)gmallocn(nLines, sizeof(GooList *));
+
+ for (i = 0; i < nLines; i++) {
+ GooList *lineWords = lines[i];
+ wordList[i] = new GooList();
+ for (j = 0; j < lineWords->getLength(); j++) {
+ TextWordSelection *sel = (TextWordSelection *)lineWords->get(j);
+ wordList[i]->append(sel->word);
+ }
+ }
+
+ *nLinesOut = nLines;
+
+ return wordList;
+}
+
class TextSelectionSizer : public TextSelectionVisitor {
public:
TextSelectionSizer(TextPage *page, double scale);
@@ -4751,6 +4775,18 @@ GooString *TextPage::getSelectionText(PDFRectangle *selection,
return dumper.getText();
}
+GooList **TextPage::getSelectionWords(PDFRectangle *selection,
+ SelectionStyle style,
+ int *nLines)
+{
+ TextSelectionDumper dumper(this);
+
+ visitSelection(&dumper, selection, style);
+ dumper.endPage();
+
+ return dumper.getWordList(nLines);
+}
+
GBool TextPage::findCharRange(int pos, int length,
double *xMin, double *yMin,
double *xMax, double *yMax) {
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index 664f9d1..6269f1c 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -563,6 +563,10 @@ public:
GooString *getSelectionText(PDFRectangle *selection,
SelectionStyle style);
+ GooList **getSelectionWords(PDFRectangle *selection,
+ SelectionStyle style,
+ int *nLines);
+
// Find a string by character position and length. If found, sets
// the text bounding rectangle and returns true; otherwise returns
// false.
commit a924246b7534e86165f8e9ab6c60d56b73a17b94
Author: Carlos Garcia Campos <carlosgc at gnome.org>
Date: Tue Jun 25 09:57:48 2013 +0200
TextOutputDev: simplify the text selection dumper
Build a list of lines of words and don't try to format the text when
detecting tables; simply add the words and lines in the right order.
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 2872f02..4ef5963 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -4039,26 +4039,55 @@ public:
int edge_end,
PDFRectangle *selection);
virtual void visitWord (TextWord *word, int begin, int end,
- PDFRectangle *selection) { };
+ PDFRectangle *selection);
+ void endPage();
GooString *getText(void);
private:
- TextLineFrag *frags;
- int nFrags, fragsSize;
+
+ void startLine();
+ void finishLine();
+
+ GooList **lines;
+ int nLines, linesSize;
+ GooList *words;
+ int tableId;
+ TextBlock *currentBlock;
};
TextSelectionDumper::TextSelectionDumper(TextPage *page)
: TextSelectionVisitor(page)
{
- fragsSize = 256;
- frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
- nFrags = 0;
+ linesSize = 256;
+ lines = (GooList **)gmallocn(linesSize, sizeof(GooList *));
+ nLines = 0;
+
+ tableId = -1;
+ currentBlock = NULL;
+ words = NULL;
}
TextSelectionDumper::~TextSelectionDumper()
{
- gfree(frags);
+ for (int i = 0; i < nLines; i++)
+ deleteGooList(lines[i], TextWordSelection);
+ gfree(lines);
+}
+
+void TextSelectionDumper::startLine()
+{
+ finishLine();
+ words = new GooList();
+}
+
+void TextSelectionDumper::finishLine()
+{
+ if (words && words->getLength() > 0)
+ lines[nLines++] = words;
+ else if (words)
+ delete words;
+ words = NULL;
}
void TextSelectionDumper::visitLine (TextLine *line,
@@ -4068,130 +4097,84 @@ void TextSelectionDumper::visitLine (TextLine *line,
int edge_end,
PDFRectangle *selection)
{
- if (nFrags == fragsSize) {
- fragsSize *= 2;
- frags = (TextLineFrag *) grealloc(frags, fragsSize * sizeof(TextLineFrag));
+ TextLineFrag frag;
+
+ if (nLines == linesSize) {
+ linesSize *= 2;
+ lines = (GooList **)grealloc(lines, linesSize * sizeof(GooList *));
+ }
+
+ frag.init(line, edge_begin, edge_end - edge_begin);
+
+ if (tableId >= 0 && frag.line->blk->tableId < 0) {
+ finishLine();
+
+ tableId = -1;
+ currentBlock = NULL;
}
- frags[nFrags].init(line, edge_begin, edge_end - edge_begin);
- ++nFrags;
+ if (frag.line->blk->tableId >= 0) { // a table
+ if (tableId == -1) {
+ tableId = frag.line->blk->tableId;
+ currentBlock = frag.line->blk;
+ }
+
+ if (currentBlock == frag.line->blk) { // the same block
+ startLine();
+ } else { // another block
+ if (currentBlock->tableEnd) { // previous block ended its row
+ startLine();
+ }
+ currentBlock = frag.line->blk;
+ }
+ } else { // not a table
+ startLine();
+ }
+}
+void TextSelectionDumper::visitWord (TextWord *word, int begin, int end,
+ PDFRectangle *selection)
+{
+ words->append(new TextWordSelection(word, begin, end));
+}
+
+void TextSelectionDumper::endPage()
+{
+ finishLine();
}
GooString *TextSelectionDumper::getText (void)
{
- GooString *s;
- TextLineFrag *frag;
+ GooString *text;
int i, j;
UnicodeMap *uMap;
char space[8], eol[16];
int spaceLen, eolLen;
- GooList *strings = NULL;
- int actual_table = -1;
- int actual_line = -1;
- int last_length = 0;
- TextBlock *actual_block = NULL;
-
- s = new GooString();
- uMap = globalParams->getTextEncoding();
+ text = new GooString();
- if (uMap == NULL)
- return s;
+ if (!(uMap = globalParams->getTextEncoding()))
+ return text;
spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
- if (nFrags > 0) {
- for (i = 0; i < nFrags; ++i) {
- frag = &frags[i];
-
- if (actual_table >= 0 && frag->line->blk->tableId < 0) {
- for (j = 0; j < strings->getLength (); j++) {
- s->append ((GooString*) strings->get (j));
- s->append (eol, eolLen);
- delete ((GooString*) strings->get (j));
- }
- delete strings;
- strings = NULL;
- actual_table = -1;
- actual_line = -1;
- actual_block = NULL;
- }
-
- // a table
- if (frag->line->blk->tableId >= 0) {
- if (actual_table == -1) {
- strings = new GooList();
- actual_table = frag->line->blk->tableId;
- actual_block = frag->line->blk;
- actual_line = -1;
- }
-
- // the same block
- if (actual_block == frag->line->blk) {
- actual_line++;
- if (actual_line >= strings->getLength ()) {
- GooString *t = new GooString ();
- // add some spaces to have this block correctly aligned
- if (actual_line > 0)
- for (j = 0; j < ((GooString*) (strings->get (actual_line - 1)))->getLength() - last_length - 1; j++)
- t->append (space, spaceLen);
- strings->append (t);
- }
- }
- // another block
- else {
- // previous block ended its row
- if (actual_block->tableEnd) {
- for (j = 0; j < strings->getLength (); j++) {
- s->append ((GooString*) strings->get (j));
- s->append (eol, eolLen);
- delete ((GooString*) strings->get (j));
- }
- delete strings;
-
- strings = new GooList();
- GooString *t = new GooString ();
- strings->append (t);
- }
- actual_block = frag->line->blk;
- actual_line = 0;
- }
-
- page->dumpFragment(frag->line->text + frag->start, frag->len, uMap, ((GooString*) strings->get (actual_line)));
- last_length = frag->len;
+ for (i = 0; i < nLines; i++) {
+ GooList *lineWords = lines[i];
+ for (j = 0; j < lineWords->getLength(); j++) {
+ TextWordSelection *sel = (TextWordSelection *)lineWords->get(j);
- if (!frag->line->blk->tableEnd) {
- ((GooString*) strings->get (actual_line))->append (space, spaceLen);
- }
- }
- // not a table
- else {
- page->dumpFragment (frag->line->text + frag->start, frag->len, uMap, s);
- if (i < nFrags - 1) {
- s->append (eol, eolLen);
- }
- }
- }
-
- if (strings != NULL) {
- for (j = 0; j < strings->getLength (); j++) {
- s->append((GooString*) strings->get (j));
- s->append(eol, eolLen);
- delete ((GooString*) strings->get (j));
- }
- delete strings;
- strings = NULL;
- actual_table = -1;
- actual_line = -1;
- actual_block = NULL;
+ page->dumpFragment (sel->word->text + sel->begin, sel->end - sel->begin, uMap, text);
+ if (j < lineWords->getLength() - 1)
+ text->append(space, spaceLen);
}
+ if (i < nLines - 1)
+ text->append(eol, eolLen);
}
uMap->decRefCnt();
- return s;
+ return text;
}
class TextSelectionSizer : public TextSelectionVisitor {
@@ -4763,6 +4746,7 @@ GooString *TextPage::getSelectionText(PDFRectangle *selection,
TextSelectionDumper dumper(this);
visitSelection(&dumper, selection, style);
+ dumper.endPage();
return dumper.getText();
}
commit c849094a2daf896d085937adff1f7659a09da062
Author: Carlos Garcia Campos <carlosgc at gnome.org>
Date: Mon Jun 24 18:29:11 2013 +0200
TextOutputDev: Move TextSelection class from TextSelectionPainter to TextSelectionVisitor
So that it can be used by other TextSelectionVisitor implementations.
Also renamed it to TextWordSelection since it contains a word selection.
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index fe051f6..2872f02 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -3999,6 +3999,21 @@ public:
PDFRectangle *selection) = 0;
protected:
+
+ class TextWordSelection {
+ public:
+ TextWordSelection(TextWord *word, int begin, int end)
+ : word(word),
+ begin(begin),
+ end(end)
+ {
+ }
+
+ TextWord *word;
+ int begin;
+ int end;
+ };
+
TextPage *page;
};
@@ -4265,20 +4280,6 @@ private:
GfxState *state;
GooList *selectionList;
Matrix ctm, ictm;
-
- class TextSelection {
- public:
- TextSelection(TextWord *word, int begin, int end)
- : word(word),
- begin(begin),
- end(end)
- {
- }
-
- TextWord *word;
- int begin;
- int end;
- };
};
TextSelectionPainter::TextSelectionPainter(TextPage *page,
@@ -4310,7 +4311,7 @@ TextSelectionPainter::TextSelectionPainter(TextPage *page,
TextSelectionPainter::~TextSelectionPainter()
{
- deleteGooList(selectionList, TextSelection);
+ deleteGooList(selectionList, TextWordSelection);
delete state;
}
@@ -4350,7 +4351,7 @@ void TextSelectionPainter::visitLine (TextLine *line,
void TextSelectionPainter::visitWord (TextWord *word, int begin, int end,
PDFRectangle *selection)
{
- selectionList->append(new TextSelection(word, begin, end));
+ selectionList->append(new TextWordSelection(word, begin, end));
}
void TextSelectionPainter::endPage()
@@ -4362,7 +4363,7 @@ void TextSelectionPainter::endPage()
out->updateFillColor(state);
for (int i = 0; i < selectionList->getLength(); i++) {
- TextSelection *sel = (TextSelection *) selectionList->get(i);
+ TextWordSelection *sel = (TextWordSelection *) selectionList->get(i);
int begin = sel->begin;
while (begin < sel->end) {
More information about the poppler
mailing list