[poppler] poppler/Makefile.am poppler/MarkedContentOutputDev.cc poppler/MarkedContentOutputDev.h poppler/StructElement.cc poppler/StructElement.h
Carlos Garcia Campos
carlosgc at kemper.freedesktop.org
Thu Dec 5 08:43:02 PST 2013
poppler/Makefile.am | 2
poppler/MarkedContentOutputDev.cc | 210 ++++++++++++++++++++++++++++++++++++++
poppler/MarkedContentOutputDev.h | 128 +++++++++++++++++++++++
poppler/StructElement.cc | 50 +++++++++
poppler/StructElement.h | 28 +++++
5 files changed, 418 insertions(+)
New commits:
commit 17b2623360ed8917e94a8e5b880e92e6db70335e
Author: Adrian Perez de Castro <aperez at igalia.com>
Date: Tue Jun 18 00:35:51 2013 +0300
Tagged-PDF: Text content extraction from structure elements
Implement StructElement::getText() by using MarkedContentOutputDev. This
output device captures pieces of text (aka "spans") which have the same
attributes into a list of TextSpan objects.
https://bugs.freedesktop.org/show_bug.cgi?id=64815
diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 9f90c9d..5f0c795 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -232,6 +232,7 @@ poppler_include_HEADERS = \
NameToUnicodeTable.h \
PSOutputDev.h \
TextOutputDev.h \
+ MarkedContentOutputDev.h \
SecurityHandler.h \
UTF.h \
UTF8.h \
@@ -306,6 +307,7 @@ libpoppler_la_SOURCES = \
XRef.cc \
PSOutputDev.cc \
TextOutputDev.cc \
+ MarkedContentOutputDev.cc \
PageLabelInfo.h \
PageLabelInfo.cc \
SecurityHandler.cc \
diff --git a/poppler/MarkedContentOutputDev.cc b/poppler/MarkedContentOutputDev.cc
new file mode 100644
index 0000000..78f2ea7
--- /dev/null
+++ b/poppler/MarkedContentOutputDev.cc
@@ -0,0 +1,210 @@
+//========================================================================
+//
+// MarkedContentOutputDev.cc
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#include "MarkedContentOutputDev.h"
+#include "GlobalParams.h"
+#include "UnicodeMap.h"
+#include "GfxState.h"
+#include "GfxFont.h"
+#include "Annot.h"
+#include <vector>
+
+
+// Creates an output device which collects the text drawn inside the marked
+// content sequence identified by mcidA. The device starts with no font, no
+// pending text, no Unicode map, a zero page size and all the components of
+// the current color set to zero.
+MarkedContentOutputDev::MarkedContentOutputDev(int mcidA)
+  : currentFont(NULL)
+  , currentText(NULL)
+  , mcid(mcidA)
+  , pageWidth(0.0)
+  , pageHeight(0.0)
+  , unicodeMap(NULL)
+{
+  currentColor.r = 0;
+  currentColor.g = 0;
+  currentColor.b = 0;
+}
+
+
+// Frees the text which was collected but never turned into a span, and
+// drops the references held on the current font and on the Unicode map.
+MarkedContentOutputDev::~MarkedContentOutputDev()
+{
+  delete currentText;
+
+  if (currentFont != NULL)
+    currentFont->decRefCnt();
+
+  if (unicodeMap != NULL)
+    unicodeMap->decRefCnt();
+}
+
+
+// Closes the span being collected. Non-empty pending text is appended to
+// textSpans together with the current font and color. Whatever happens,
+// no pending text is left when the function returns.
+void MarkedContentOutputDev::endSpan()
+{
+  if (currentText && currentText->getLength()) {
+    // The TextSpan takes ownership of currentText and
+    // increases the reference count for currentFont.
+    textSpans.push_back(TextSpan(currentText,
+                                 currentFont,
+                                 currentColor));
+  } else {
+    // Nobody took ownership of the buffer, so it has to be released here
+    // before the only pointer to it is cleared. Deleting NULL is a no-op.
+    delete currentText;
+  }
+  currentText = NULL;
+}
+
+
+// Records the size of the page being started; drawChar() uses it to discard
+// characters placed outside of the page. When no state is given the size is
+// reset to zero. The pageNum and xref arguments are not used.
+void MarkedContentOutputDev::startPage(int pageNum, GfxState *state, XRef *xref)
+{
+  if (!state) {
+    pageWidth = 0.0;
+    pageHeight = 0.0;
+    return;
+  }
+
+  pageWidth = state->getPageWidth();
+  pageHeight = state->getPageHeight();
+}
+
+
+// Forgets the size of the page which has just been finished.
+void MarkedContentOutputDev::endPage()
+{
+  pageWidth = 0.0;
+  pageHeight = 0.0;
+}
+
+
+// Called when a marked content sequence is opened. Text collection starts
+// at the sequence whose "MCID" property equals the MCID given to the
+// constructor; from then on every nested sequence is tracked until the
+// matching end of the outer one.
+//
+// name:       tag of the sequence (not used).
+// properties: property list of the sequence; may be NULL.
+void MarkedContentOutputDev::beginMarkedContent(char *name, Dict *properties)
+{
+  int id = -1;
+  if (properties)
+    properties->lookupInt("MCID", NULL, &id);
+
+  // The stack keeps track of the MCIDs of nested marked content.
+  if (inMarkedContent()) {
+    // A nested sequence is pushed even when it has no MCID (id is still -1):
+    // endMarkedContent() pops one entry for each sequence closed while
+    // collecting, so skipping this one would make its end marker pop the
+    // entry of the enclosing sequence and stop the collection too early.
+    mcidStack.push_back(id);
+    return;
+  }
+
+  // Outside of the wanted content: start only at the matching MCID.
+  if (id != -1 && id == mcid)
+    mcidStack.push_back(id);
+}
+
+
+// Called when a marked content sequence is closed. Does nothing unless
+// text is being collected. The state argument is not used.
+void MarkedContentOutputDev::endMarkedContent(GfxState *state)
+{
+  if (!inMarkedContent())
+    return;
+
+  mcidStack.pop_back();
+
+  // Once the outermost tracked sequence has been popped, the last piece
+  // of text collected has to end up in a TextSpan.
+  if (!inMarkedContent())
+    endSpan();
+}
+
+
+// Tells whether switching from currentFont to the given font starts a
+// new span. Two non-NULL fonts are considered to be the same font when
+// their IDs have the same object and generation numbers.
+bool MarkedContentOutputDev::needFontChange(GfxFont* font) const
+{
+  // The very same object; this also covers both pointers being NULL.
+  if (font == currentFont)
+    return gFalse;
+
+  // No font in use so far: only a usable font counts as a change.
+  if (currentFont == NULL)
+    return font != NULL && font->isOk();
+
+  // There was a font, and now there is none.
+  if (!font)
+    return gTrue;
+
+  const Ref *oldId = currentFont->getID();
+  const Ref *newId = font->getID();
+  const bool sameRef = oldId->num == newId->num && oldId->gen == newId->gen;
+  return !sameRef;
+}
+
+
+// Collects one character drawn while inside of the wanted marked content.
+//
+// A change of color or font closes the span being collected and starts a
+// new one. Characters placed outside of the page, or having a position or
+// size which is not a number, are dropped. The remaining ones are converted
+// with the text encoding obtained from globalParams and appended to the
+// pending text.
+//
+// xx, yy:  position of the character, transformed with state->transform().
+// dx, dy:  advance of the character, including char and word spacing.
+// c:       character code; 0x20 also gets the word spacing subtracted.
+// u, uLen: Unicode code points of the character, and how many they are.
+// ox, oy and nBytes are not used.
+void MarkedContentOutputDev::drawChar(GfxState *state,
+                                      double xx, double yy,
+                                      double dx, double dy,
+                                      double ox, double oy,
+                                      CharCode c, int nBytes,
+                                      Unicode *u, int uLen)
+{
+  if (!inMarkedContent() || !uLen)
+    return;
+
+
+  // Color changes are tracked here so the color can be chosen depending on
+  // the render mode (for mode 1 stroke color is used), so there is no need
+  // to implement both updateFillColor() and updateStrokeColor().
+  GBool colorChange = gFalse;
+  GfxRGB color;
+  if ((state->getRender() & 3) == 1)
+    state->getStrokeRGB(&color);
+  else
+    state->getFillRGB(&color);
+
+  colorChange = (color.r != currentColor.r ||
+                 color.g != currentColor.g ||
+                 color.b != currentColor.b);
+
+  // Check also for font changes.
+  GBool fontChange = needFontChange(state->getFont());
+
+  // Save a span with the current changes.
+  if (colorChange || fontChange) {
+    endSpan();
+  }
+
+  // Perform the color/font changes.
+  if (colorChange)
+    currentColor = color;
+
+  if (fontChange) {
+    if (currentFont != NULL) {
+      currentFont->decRefCnt();
+      currentFont = NULL;
+    }
+    if (state->getFont() != NULL) {
+      currentFont = state->getFont();
+      currentFont->incRefCnt();
+    }
+  }
+
+
+  double sp, dx2, dy2, w1, h1, x1, y1;
+
+  // Subtract char and word spacing from the (dx,dy) values
+  sp = state->getCharSpace();
+  if (c == (CharCode) 0x20)
+    sp += state->getWordSpace();
+  state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
+  dx -= dx2;
+  dy -= dy2;
+  state->transformDelta(dx, dy, &w1, &h1);
+  state->transform(xx, yy, &x1, &y1);
+
+  // Throw away characters that are not inside the page boundaries.
+  if (x1 + w1 < 0 || x1 > pageWidth || y1 + h1 < 0 || y1 > pageHeight)
+    return;
+
+  // Make a sanity check on character size. Note: (x != x) <-> isnan(x)
+  if (x1 != x1 || y1 != y1 || w1 != w1 || h1 != h1)
+    return;
+
+  // Get the text encoding once, the first time it is needed. Without an
+  // encoding the character cannot be converted, so it is dropped instead
+  // of dereferencing a NULL map.
+  if (!unicodeMap) {
+    unicodeMap = globalParams->getTextEncoding();
+    if (!unicodeMap)
+      return;
+  }
+
+  for (int i = 0; i < uLen; i++) {
+    // Soft hyphen markers are skipped, as they are invisible unless
+    // rendering is done to an actual device and the hyphenation hint
+    // used. MarkedContentOutputDev extracts the *visible* text content.
+    if (u[i] == 0x00AD)
+      continue;
+
+    // Add the encoded sequence to the current text span.
+    char buf[8];
+    int n = unicodeMap->mapUnicode(u[i], buf, sizeof(buf));
+    if (n > 0) {
+      if (currentText == NULL)
+        currentText = new GooString();
+      currentText->append(buf, n);
+    }
+  }
+}
+
+
+// Gives access to the spans collected so far. The array is a member of
+// the output device, so the reference must not be used after the device
+// has been destroyed.
+const TextSpanArray& MarkedContentOutputDev::getTextSpans() const
+{
+  return textSpans;
+}
diff --git a/poppler/MarkedContentOutputDev.h b/poppler/MarkedContentOutputDev.h
new file mode 100644
index 0000000..6decc9b
--- /dev/null
+++ b/poppler/MarkedContentOutputDev.h
@@ -0,0 +1,128 @@
+//========================================================================
+//
+// MarkedContentOutputDev.h
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifndef MARKEDCONTENTOUTPUTDEV_H
+#define MARKEDCONTENTOUTPUTDEV_H
+
+#include "goo/gtypes.h"
+#include "goo/gmem.h"
+#include "OutputDev.h"
+#include "GfxState.h"
+#include "GfxFont.h"
+#include <vector>
+
+class Dict;
+class UnicodeMap;
+
+
+// A piece of text which was drawn using one font and one color.
+//
+// Copies of a TextSpan share a single reference-counted Data payload, so
+// copying never duplicates the text. The payload is destroyed together
+// with the last TextSpan which refers to it. Only MarkedContentOutputDev
+// is able to create new spans.
+class TextSpan {
+public:
+  // Shares the payload of the other span.
+  TextSpan(const TextSpan& other): data(other.data) {
+    data->refcount++;
+  }
+
+  // Shares the payload of the other span, releasing the one which was
+  // held before. Nothing is done when both spans share a payload already,
+  // which also covers self-assignment.
+  TextSpan& operator=(const TextSpan& other) {
+    if (data != other.data) {
+      Data *previous = data;
+      data = other.data;
+      data->refcount++;
+      // Release the old payload, otherwise it would never be freed.
+      if (previous && --previous->refcount == 0)
+        delete previous;
+    }
+    return *this;
+  }
+
+  ~TextSpan() {
+    if (data && --data->refcount == 0)
+      delete data;
+  }
+
+  // The returned objects are owned by the span (and shared with its copies).
+  GfxFont* getFont() const { return data->font; }
+  GooString* getText() const { return data->text; }
+  GfxRGB& getColor() const { return data->color; }
+
+private:
+  // Note: Takes ownership of strings, increases refcount for font.
+  TextSpan(GooString *text,
+           GfxFont *font,
+           const GfxRGB& color)
+    : data(new Data) {
+    data->text = text;
+    data->font = font;
+    data->color = color;
+    if (data->font)
+      data->font->incRefCnt();
+  }
+
+  // Payload shared between all the copies of a span.
+  struct Data {
+    GfxFont *font;
+    GooString *text;
+    GfxRGB color;
+    unsigned refcount;
+
+    // The pointers start as NULL, so the destructor is safe to run even
+    // before the owning TextSpan has filled them in.
+    Data(): font(NULL), text(NULL), refcount(1) {}
+
+    ~Data() {
+      assert(refcount == 0);
+      if (font)
+        font->decRefCnt();
+      delete text;
+    }
+  };
+
+  Data *data;
+
+  friend class MarkedContentOutputDev;
+};
+
+
+typedef std::vector<TextSpan> TextSpanArray;
+
+
+// Output device which collects, as a list of TextSpan objects, the text
+// drawn inside of the marked content sequence having a given MCID.
+class MarkedContentOutputDev: public OutputDev {
+public:
+  // mcidA: MCID of the marked content sequence to extract the text from.
+  MarkedContentOutputDev(int mcidA);
+  virtual ~MarkedContentOutputDev();
+
+  //----- capabilities reported to the caller of the device
+
+  virtual GBool isOk() { return gTrue; }
+  virtual GBool upsideDown() { return gTrue; }
+  virtual GBool useDrawChar() { return gTrue; }
+  virtual GBool interpretType3Chars() { return gFalse; }
+  virtual GBool needNonText() { return gFalse; }
+  virtual GBool needCharCount() { return gFalse; }
+
+  //----- page boundaries
+
+  virtual void startPage(int pageNum, GfxState *state, XRef *xref);
+  virtual void endPage();
+
+  //----- text drawing
+
+  virtual void drawChar(GfxState *state,
+                        double xx, double yy,
+                        double dx, double dy,
+                        double ox, double oy,
+                        CharCode c, int nBytes,
+                        Unicode *u, int uLen);
+
+  //----- marked content
+
+  virtual void beginMarkedContent(char *name, Dict *properties);
+  virtual void endMarkedContent(GfxState *state);
+
+  // Spans collected so far; the array belongs to the output device.
+  const TextSpanArray& getTextSpans() const;
+
+private:
+
+  // Turns the pending text, if any, into a new item of textSpans.
+  void endSpan();
+  // Whether the content being processed is inside of the wanted sequence.
+  bool inMarkedContent() const { return !mcidStack.empty(); }
+  // Whether the given font is a different one than currentFont.
+  bool needFontChange(GfxFont* font) const;
+
+  GfxFont *currentFont;        // font of the span being collected
+  GooString *currentText;      // text of the span being collected
+  GfxRGB currentColor;         // color of the span being collected
+  TextSpanArray textSpans;     // spans which are finished already
+  int mcid;                    // MCID of the wanted sequence
+  std::vector<int> mcidStack;  // MCIDs of the sequences currently open
+  double pageWidth;            // set by startPage()
+  double pageHeight;           // set by startPage()
+  UnicodeMap *unicodeMap;      // text encoding, fetched by drawChar()
+};
+
+#endif /* !MARKEDCONTENTOUTPUTDEV_H */
diff --git a/poppler/StructElement.cc b/poppler/StructElement.cc
index 7d893df..3a98658 100644
--- a/poppler/StructElement.cc
+++ b/poppler/StructElement.cc
@@ -14,6 +14,8 @@
#include "StructElement.h"
#include "StructTreeRoot.h"
+#include "GlobalParams.h"
+#include "UnicodeMap.h"
#include "PDFDoc.h"
#include "Dict.h"
@@ -981,6 +983,54 @@ const Attribute *StructElement::findAttribute(Attribute::Type attributeType, GBo
return NULL;
}
+// Appends to the given string the text of this element: the text of the
+// marked content for content elements which are not object references,
+// or the text of the whole subtree for other elements when the recursive
+// flag is set. A new string is created when the one passed is NULL.
+//
+// Returns the string the text was appended to, or NULL when the element
+// is not such a content element and recursion was not requested.
+GooString* StructElement::appendSubTreeText(GooString *string, GBool recursive) const
+{
+  const bool isLeaf = isContent() && !isObjectRef();
+
+  if (!isLeaf && !recursive)
+    return NULL;
+
+  if (string == NULL)
+    string = new GooString();
+
+  if (isLeaf) {
+    MarkedContentOutputDev mcdev(getMCID());
+    const TextSpanArray& spans(getTextSpansInternal(mcdev));
+
+    TextSpanArray::const_iterator span = spans.begin();
+    while (span != spans.end()) {
+      string->append(span->getText());
+      ++span;
+    }
+    return string;
+  }
+
+  // Do a depth-first traversal, to get elements in logical order
+  for (unsigned child = 0; child < getNumElements(); ++child)
+    getElement(child)->appendSubTreeText(string, recursive);
+
+  return string;
+}
+
+// Runs the given output device over the document and returns the spans it
+// collected. Only the page referenced by the element is displayed when it
+// can be found; otherwise all the pages of the document are displayed.
+//
+// The returned array is owned by mcdev, and it is valid only for as long
+// as mcdev is alive.
+const TextSpanArray& StructElement::getTextSpansInternal(MarkedContentOutputDev& mcdev) const
+{
+  assert(isContent());
+
+  PDFDoc *doc = treeRoot->getDoc();
+
+  int firstPage = 0;
+  Ref pageRef;
+  if (getPageRef(pageRef))
+    firstPage = doc->findPage(pageRef.num, pageRef.gen);
+
+  int lastPage = firstPage;
+  if (firstPage == 0) {
+    // The page is not known: go through the whole document.
+    firstPage = 1;
+    lastPage = doc->getNumPages();
+  }
+
+  doc->displayPages(&mcdev, firstPage, lastPage, 72.0, 72.0, 0, gTrue, gFalse, gFalse);
+  return mcdev.getTextSpans();
+}
+
static StructElement::Type roleMapResolve(Dict *roleMap, const char *name, const char *curName, Object *resolved)
{
// Circular reference
diff --git a/poppler/StructElement.h b/poppler/StructElement.h
index 00deef4..b9eef8a 100644
--- a/poppler/StructElement.h
+++ b/poppler/StructElement.h
@@ -17,6 +17,7 @@
#include "goo/gtypes.h"
#include "goo/GooString.h"
+#include "MarkedContentOutputDev.h"
#include "Object.h"
#include <vector>
#include <set>
@@ -218,9 +219,36 @@ public:
const GooString *getActualText() const { return isContent() ? NULL : s->actualText; }
GooString *getActualText() { return isContent() ? NULL : s->actualText; }
+  // Text content referenced by the element.
+  //
+  // - MCID reference elements give the text of the corresponding marked
+  //   content object in the page stream. The "recursive" flag makes no
+  //   difference for them.
+  // - Other elements give, when the "recursive" flag is set, the text
+  //   enclosed by *all* the child MCID reference elements of the subtree,
+  //   assembled by visiting the leaf MCID reference elements in logical
+  //   order.
+  // - NULL is returned in any other case.
+  //
+  // The string returned is a new one, and the caller becomes its owner.
+  //
+  GooString *getText(GBool recursive = gTrue) const {
+    GooString *text = appendSubTreeText(NULL, recursive);
+    return text;
+  }
+
+  // Text spans of the marked content referenced by a content element.
+  // An empty array is returned for elements which are not content.
+  const TextSpanArray getTextSpans() const {
+    if (isContent()) {
+      MarkedContentOutputDev mcdev(getMCID());
+      // The spans are copied out here, before mcdev gets destroyed.
+      return getTextSpansInternal(mcdev);
+    }
+    return TextSpanArray();
+  }
+
~StructElement();
private:
+ GooString* appendSubTreeText(GooString *string, GBool recursive) const;
+ const TextSpanArray& getTextSpansInternal(MarkedContentOutputDev& mcdev) const;
+
typedef std::vector<Attribute*> AttrPtrArray;
typedef std::vector<StructElement*> ElemPtrArray;
More information about the poppler
mailing list