[poppler] poppler/Makefile.am poppler/MarkedContentOutputDev.cc poppler/MarkedContentOutputDev.h poppler/StructElement.cc poppler/StructElement.h

Carlos Garcia Campos carlosgc at kemper.freedesktop.org
Thu Dec 5 08:43:02 PST 2013


 poppler/Makefile.am               |    2 
 poppler/MarkedContentOutputDev.cc |  210 ++++++++++++++++++++++++++++++++++++++
 poppler/MarkedContentOutputDev.h  |  128 +++++++++++++++++++++++
 poppler/StructElement.cc          |   50 +++++++++
 poppler/StructElement.h           |   28 +++++
 5 files changed, 418 insertions(+)

New commits:
commit 17b2623360ed8917e94a8e5b880e92e6db70335e
Author: Adrian Perez de Castro <aperez at igalia.com>
Date:   Tue Jun 18 00:35:51 2013 +0300

    Tagged-PDF: Text content extraction from structure elements
    
    Implement StructElement::getText() using MarkedContentOutputDev. This output
    device captures pieces of text (aka "spans") which have the same attributes
    into a list of TextSpan objects.
    
    https://bugs.freedesktop.org/show_bug.cgi?id=64815

diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 9f90c9d..5f0c795 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -232,6 +232,7 @@ poppler_include_HEADERS =	\
 	NameToUnicodeTable.h	\
 	PSOutputDev.h		\
 	TextOutputDev.h		\
+	MarkedContentOutputDev.h \
 	SecurityHandler.h	\
 	UTF.h			\
 	UTF8.h			\
@@ -306,6 +307,7 @@ libpoppler_la_SOURCES =		\
 	XRef.cc			\
 	PSOutputDev.cc		\
 	TextOutputDev.cc	\
+	MarkedContentOutputDev.cc \
 	PageLabelInfo.h		\
 	PageLabelInfo.cc	\
 	SecurityHandler.cc	\
diff --git a/poppler/MarkedContentOutputDev.cc b/poppler/MarkedContentOutputDev.cc
new file mode 100644
index 0000000..78f2ea7
--- /dev/null
+++ b/poppler/MarkedContentOutputDev.cc
@@ -0,0 +1,210 @@
+//========================================================================
+//
+// MarkedContentOutputDev.cc
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#include "MarkedContentOutputDev.h"
+#include "GlobalParams.h"
+#include "UnicodeMap.h"
+#include "GfxState.h"
+#include "GfxFont.h"
+#include "Annot.h"
+#include <vector>
+
+// Creates a device that collects text only for marked content tagged mcidA.
+MarkedContentOutputDev::MarkedContentOutputDev(int mcidA):
+  currentFont(NULL),
+  currentText(NULL),
+  mcid(mcidA),
+  pageWidth(0.0),
+  pageHeight(0.0),
+  unicodeMap(NULL)
+{
+  currentColor.r = currentColor.g = currentColor.b = 0;
+}
+
+// Drops the references held on the encoding map and font, and frees pending text.
+MarkedContentOutputDev::~MarkedContentOutputDev()
+{
+  if (unicodeMap)
+    unicodeMap->decRefCnt();
+  if (currentFont)
+    currentFont->decRefCnt();
+  delete currentText;
+}
+
+// Closes the span being collected, storing it if it holds any text.
+void MarkedContentOutputDev::endSpan()
+{
+  const bool hasText = currentText != NULL && currentText->getLength() != 0;
+
+  // A TextSpan owns the string handed to it and takes its own
+  // reference on the font, so nothing is released here.
+  if (hasText)
+    textSpans.push_back(TextSpan(currentText, currentFont, currentColor));
+
+  currentText = NULL;
+}
+
+// Records the page size, which drawChar() uses to discard characters
+// lying outside the page.
+void MarkedContentOutputDev::startPage(int pageNum, GfxState *state, XRef *xref)
+{
+  // Without a graphics state the page is treated as having zero size.
+  const bool haveState = (state != NULL);
+
+  pageWidth  = haveState ? state->getPageWidth()  : 0.0;
+  pageHeight = haveState ? state->getPageHeight() : 0.0;
+}
+
+// Forgets the page size once the page has been processed.
+void MarkedContentOutputDev::endPage()
+{
+  pageWidth = pageHeight = 0.0;
+}
+
+// Starts tracking when the requested MCID opens, and counts nested sequences.
+void MarkedContentOutputDev::beginMarkedContent(char *name, Dict *properties)
+{
+  int id = -1;
+  if (properties)
+    properties->lookupInt("MCID", NULL, &id);
+
+  // While inside the requested sequence, push every nested sequence, even
+  // one without an MCID (id stays -1): endMarkedContent() pops one entry
+  // per call, so an unpushed nested sequence would end tracking too early.
+  // Outside of it, only the requested MCID starts tracking.
+  if (inMarkedContent() || (id != -1 && id == mcid))
+    mcidStack.push_back(id);
+}
+
+// Leaves one nesting level; closing the outermost one flushes the text.
+void MarkedContentOutputDev::endMarkedContent(GfxState *state)
+{
+  // Sequences that were never tracked have nothing to unwind.
+  if (!inMarkedContent())
+    return;
+  mcidStack.pop_back();
+  // Popping the outermost MCID ends collection: store the pending span.
+  if (!inMarkedContent())
+    endSpan();
+}
+
+// Tells whether "font" differs from the font of the span being collected.
+bool MarkedContentOutputDev::needFontChange(GfxFont* font) const
+{
+  // Identical pointers (including both NULL) never need a change.
+  if (font == currentFont)
+    return gFalse;
+
+  // No font recorded yet: only switch to a font that is usable.
+  if (currentFont == NULL)
+    return font != NULL && font->isOk();
+
+  // A font is recorded, but the state no longer has one.
+  if (font == NULL)
+    return gTrue;
+
+  // Distinct objects are still the same font when their Refs match.
+  return currentFont->getID()->num != font->getID()->num ||
+         currentFont->getID()->gen != font->getID()->gen;
+}
+
+// Adds the Unicode text of one drawn character to the span being collected.
+void MarkedContentOutputDev::drawChar(GfxState *state,
+                                      double xx, double yy,
+                                      double dx, double dy,
+                                      double ox, double oy,
+                                      CharCode c, int nBytes,
+                                      Unicode *u, int uLen)
+{
+  // Ignore text outside the tracked content, or without Unicode mapping.
+  if (!inMarkedContent() || !uLen)
+    return;
+
+  // Color changes are tracked here so the color can be chosen depending on
+  // the render mode (for mode 1 stroke color is used), so there is no need
+  // to implement both updateFillColor() and updateStrokeColor().
+  GBool colorChange = gFalse;
+  GfxRGB color;
+  if ((state->getRender() & 3) == 1)
+    state->getStrokeRGB(&color);
+  else
+    state->getFillRGB(&color);
+
+  colorChange = (color.r != currentColor.r ||
+                 color.g != currentColor.g ||
+                 color.b != currentColor.b);
+
+  // Check also for font changes.
+  GBool fontChange = needFontChange(state->getFont());
+
+  // Save a span with the current changes.
+  if (colorChange || fontChange) {
+    endSpan();
+  }
+
+  // Perform the color/font changes.
+  if (colorChange)
+    currentColor = color;
+
+  if (fontChange) {
+    if (currentFont != NULL) {
+      currentFont->decRefCnt();
+      currentFont = NULL;
+    }
+    if (state->getFont() != NULL) {
+      currentFont = state->getFont();
+      currentFont->incRefCnt();
+    }
+  }
+
+  // Device-space position (x1,y1) and advance (w1,h1) of the character.
+  double sp, dx2, dy2, w1, h1, x1, y1;
+
+  // Subtract char and word spacing from the (dx,dy) values
+  sp = state->getCharSpace();
+  if (c == (CharCode) 0x20)
+    sp += state->getWordSpace();
+  state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
+  dx -= dx2;
+  dy -= dy2;
+  state->transformDelta(dx, dy, &w1, &h1);
+  state->transform(xx, yy, &x1, &y1);
+
+  // Throw away characters that are not inside the page boundaries.
+  if (x1 + w1 < 0 || x1 > pageWidth || y1 + h1 < 0 || y1 > pageHeight)
+    return;
+
+  // Make a sanity check on character size. Note: (x != x) <-> isnan(x)
+  if (x1 != x1 || y1 != y1 || w1 != w1 || h1 != h1)
+    return;
+
+  for (int i = 0; i < uLen; i++) {
+    // Soft hyphen markers are skipped, as they are invisible unless
+    // rendering is done to an actual device and the hyphenation hint
+    // used. MarkedContentOutputDev extracts the *visible* text content.
+    if (u[i] != 0x00AD) {
+      // Lazily fetch the text encoding map; give up if none is available.
+      if (!unicodeMap && !(unicodeMap = globalParams->getTextEncoding()))
+        return;
+
+      char buf[8];
+      int n = unicodeMap->mapUnicode(u[i], buf, sizeof(buf));
+      if (n > 0) {
+        if (currentText == NULL)
+          currentText = new GooString();
+        currentText->append(buf, n);
+      }
+    }
+  }
+}
+
+// Spans collected so far; the reference is valid while this device lives.
+const TextSpanArray& MarkedContentOutputDev::getTextSpans() const
+{
+  return textSpans;
+}
diff --git a/poppler/MarkedContentOutputDev.h b/poppler/MarkedContentOutputDev.h
new file mode 100644
index 0000000..6decc9b
--- /dev/null
+++ b/poppler/MarkedContentOutputDev.h
@@ -0,0 +1,128 @@
+//========================================================================
+//
+// MarkedContentOutputDev.h
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifndef MARKEDCONTENTOUTPUTDEV_H
+#define MARKEDCONTENTOUTPUTDEV_H
+
+#include "goo/gtypes.h"
+#include "goo/gmem.h"
+#include "OutputDev.h"
+#include "GfxState.h"
+#include "GfxFont.h"
+#include <vector>
+
+class Dict;
+class UnicodeMap;
+
+// Text run with a single font and color; copies share one refcounted Data.
+class TextSpan {
+public:
+  TextSpan(const TextSpan& other): data(other.data) {
+    data->refcount++;
+  }
+
+  // Drops the reference held so far before sharing the data of "other".
+  TextSpan& operator=(const TextSpan& other) {
+    if (this != &other) {
+      if (data && --data->refcount == 0)
+        delete data;
+      data = other.data;
+      data->refcount++;
+    }
+    return *this;
+  }
+
+  ~TextSpan() {
+    if (data && --data->refcount == 0)
+      delete data;
+  }
+
+  GfxFont* getFont() const { return data->font; }
+  GooString* getText() const { return data->text; }
+  GfxRGB& getColor() const { return data->color; }
+
+private:
+  // Note: Takes ownership of strings, increases refcount for font.
+  TextSpan(GooString *text,
+           GfxFont *font,
+           const GfxRGB& color)
+      : data(new Data) {
+    data->text = text;
+    data->font = font;
+    data->color = color;
+    if (data->font)
+      data->font->incRefCnt();
+  }
+
+  struct Data {
+    GfxFont   *font;
+    GooString *text;
+    GfxRGB     color;
+    unsigned refcount;
+    Data(): refcount(1) {}
+    ~Data() {
+      assert(refcount == 0);
+      if (font)
+        font->decRefCnt();
+      delete text;
+    }
+  };
+
+  Data *data;
+  friend class MarkedContentOutputDev;
+};
+
+
+typedef std::vector<TextSpan> TextSpanArray;
+
+// Output device that gathers, as TextSpans, the text drawn inside one MCID.
+class MarkedContentOutputDev: public OutputDev {
+public:
+  MarkedContentOutputDev(int mcidA);
+  virtual ~MarkedContentOutputDev();
+
+  virtual GBool isOk() { return gTrue; }
+  virtual GBool upsideDown() { return gTrue; }
+  virtual GBool useDrawChar() { return gTrue; }
+  virtual GBool interpretType3Chars() { return gFalse; }
+  virtual GBool needNonText() { return gFalse; }
+  virtual GBool needCharCount() { return gFalse; }
+
+  virtual void startPage(int pageNum, GfxState *state, XRef *xref);
+  virtual void endPage();
+
+  virtual void drawChar(GfxState *state,
+                        double xx, double yy,
+                        double dx, double dy,
+                        double ox, double oy,
+                        CharCode c, int nBytes,
+                        Unicode *u, int uLen);
+
+  virtual void beginMarkedContent(char *name, Dict *properties);
+  virtual void endMarkedContent(GfxState *state);
+
+  const TextSpanArray& getTextSpans() const;  // spans collected so far
+
+private:
+
+  void endSpan();  // stores the pending text, if any, as a new span
+  bool inMarkedContent() const { return mcidStack.size() > 0; }
+  bool needFontChange(GfxFont* font) const;
+
+  GfxFont         *currentFont;   // font of the pending span (reference held)
+  GooString       *currentText;   // text of the pending span, or NULL
+  GfxRGB           currentColor;  // color of the pending span
+  TextSpanArray    textSpans;     // finished spans, in drawing order
+  int              mcid;          // MCID whose content is extracted
+  std::vector<int> mcidStack;     // marked content nesting being tracked
+  double           pageWidth;     // set in startPage(), reset in endPage()
+  double           pageHeight;    // set in startPage(), reset in endPage()
+  UnicodeMap      *unicodeMap;    // fetched on first use in drawChar()
+};
+
+#endif /* !MARKEDCONTENTOUTPUTDEV_H */
diff --git a/poppler/StructElement.cc b/poppler/StructElement.cc
index 7d893df..3a98658 100644
--- a/poppler/StructElement.cc
+++ b/poppler/StructElement.cc
@@ -14,6 +14,8 @@
 
 #include "StructElement.h"
 #include "StructTreeRoot.h"
+#include "GlobalParams.h"
+#include "UnicodeMap.h"
 #include "PDFDoc.h"
 #include "Dict.h"
 
@@ -981,6 +983,54 @@ const Attribute *StructElement::findAttribute(Attribute::Type attributeType, GBo
   return NULL;
 }
 
+GooString* StructElement::appendSubTreeText(GooString *string, GBool recursive) const
+{
+  const GBool isLeaf = isContent() && !isObjectRef();
+
+  // Inner elements contribute text only when recursion was requested.
+  if (!isLeaf && !recursive)
+    return NULL;
+
+  if (isLeaf) {
+    // MCID leaf: run the page content through a device that gathers
+    // the text spans of this MCID, then concatenate their text.
+    MarkedContentOutputDev mcdev(getMCID());
+    const TextSpanArray& spans = getTextSpansInternal(mcdev);
+    if (string == NULL)
+      string = new GooString();
+    for (TextSpanArray::size_type k = 0; k < spans.size(); ++k)
+      string->append(spans[k].getText());
+    return string;
+  }
+
+  // Inner element: visit the children depth-first (logical order).
+  if (string == NULL)
+    string = new GooString();
+  for (unsigned idx = 0; idx < getNumElements(); ++idx)
+    getElement(idx)->appendSubTreeText(string, recursive);
+  return string;
+}
+
+const TextSpanArray& StructElement::getTextSpansInternal(MarkedContentOutputDev& mcdev) const
+{
+  assert(isContent());
+
+  // Page number of the content; 0 (no usable page) triggers the fallback.
+  int firstPage = 0;
+  Ref pageRef;
+  if (getPageRef(pageRef))
+    firstPage = treeRoot->getDoc()->findPage(pageRef.num, pageRef.gen);
+  int lastPage = firstPage;
+
+  // Unknown page: fall back to scanning the whole document.
+  if (firstPage == 0) {
+    firstPage = 1;
+    lastPage = treeRoot->getDoc()->getNumPages();
+  }
+  treeRoot->getDoc()->displayPages(&mcdev, firstPage, lastPage, 72.0, 72.0, 0, gTrue, gFalse, gFalse);
+  return mcdev.getTextSpans();
+}
+
 static StructElement::Type roleMapResolve(Dict *roleMap, const char *name, const char *curName, Object *resolved)
 {
   // Circular reference
diff --git a/poppler/StructElement.h b/poppler/StructElement.h
index 00deef4..b9eef8a 100644
--- a/poppler/StructElement.h
+++ b/poppler/StructElement.h
@@ -17,6 +17,7 @@
 
 #include "goo/gtypes.h"
 #include "goo/GooString.h"
+#include "MarkedContentOutputDev.h"
 #include "Object.h"
 #include <vector>
 #include <set>
@@ -218,9 +219,36 @@ public:
   const GooString *getActualText() const { return isContent() ? NULL : s->actualText; }
   GooString *getActualText() { return isContent() ? NULL : s->actualText; }
 
+  // Content text referenced by the element:
+  //
+  // - For MCID reference elements, this is just the text of the
+  //   corresponding marked content object in the page stream, regardless
+  //   of the setting of the "recursive" flag.
+  // - For other elements, if the "recursive" flag is set, the text
+  //   enclosed by *all* the child MCID reference elements of the subtree
+  //   is returned. The text is assembled by traversing the leaf MCID
+  //   reference elements in logical order.
+  // - In any other case, the function returns NULL.
+  //
+  // A new string is returned, and its ownership is passed to the caller.
+  // Each call extracts the text anew by processing the page content.
+  GooString *getText(GBool recursive = gTrue) const {
+    return appendSubTreeText(NULL, recursive);
+  }
+
+  const TextSpanArray getTextSpans() const {
+    if (!isContent())
+      return TextSpanArray();  // only content elements carry text spans
+    MarkedContentOutputDev mcdev(getMCID());
+    return getTextSpansInternal(mcdev);  // copied out before mcdev is destroyed
+  }
+
   ~StructElement();
 
 private:
+  GooString* appendSubTreeText(GooString *string, GBool recursive) const;
+  const TextSpanArray& getTextSpansInternal(MarkedContentOutputDev& mcdev) const;
+
   typedef std::vector<Attribute*>     AttrPtrArray;
   typedef std::vector<StructElement*> ElemPtrArray;
 


More information about the poppler mailing list