[poppler] 2 commits - cpp/poppler-page.cpp cpp/poppler-page.h cpp/poppler-private.h cpp/tests

Albert Astals Cid aacid at kemper.freedesktop.org
Mon Feb 26 23:45:18 UTC 2018


 cpp/poppler-page.cpp       |   91 +++++++++++++++++++++++++++++++++++++++++++++
 cpp/poppler-page.h         |   58 ++++++++++++++++++++++++++++
 cpp/poppler-private.h      |    9 ++++
 cpp/tests/poppler-dump.cpp |   34 ++++++++++++++++
 4 files changed, 192 insertions(+)

New commits:
commit 2740b3aca81a6a8c690540fc141e5923a1fff460
Author: Albert Astals Cid <aacid at kde.org>
Date:   Tue Feb 27 00:47:04 2018 +0100

    cpp: Add since

diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index df5cb36a..93a13d18 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -1,6 +1,7 @@
 /*
  * Copyright (C) 2009-2010, Pino Toscano <pino at kde.org>
  * Copyright (C) 2018, Suzuki Toshiya <mpsuzuki at hiroshima-u.ac.jp>
+ * Copyright (C) 2018, Albert Astals Cid <aacid at kde.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -111,6 +112,8 @@ public:
        up-to-down), the std::vector contains the text in the proper
        order.
 
+       \since 0.63
+
        \note The page object owns the text_box objects as unique_ptr,
              the caller is not needed to free them.
 
commit 42a6b8651f040f0960802e705b1aea82a956a63b
Author: suzuki toshiya <mpsuzuki at hiroshima-u.ac.jp>
Date:   Tue Feb 27 00:46:18 2018 +0100

    cpp: Add page::text_list

diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index 8913c8eb..83d48f07 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -2,6 +2,7 @@
  * Copyright (C) 2009-2010, Pino Toscano <pino at kde.org>
  * Copyright (C) 2017, Albert Astals Cid <aacid at kde.org>
  * Copyright (C) 2017, Jason Alan Palmer <jalanpalmer at gmail.com>
+ * Copyright (C) 2018, Suzuki Toshiya <mpsuzuki at hiroshima-u.ac.jp>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -285,3 +286,93 @@ ustring page::text(const rectf &r, text_layout_enum layout_mode) const
     }
     return ustring::from_utf8(s->getCString());
 }
+
+/*
+ * Member definitions of text_box, the item type returned by page::text_list()
+ */
+text_box::~text_box() = default; /* defined out of line; the header only forward-declares text_box_data */
+
+text_box::text_box(text_box_data *data) : m_data{data} /* takes ownership of data through the unique_ptr member */
+{
+}
+
+ustring text_box::text() const /* returns a copy of the stored text */
+{
+    return m_data->text;
+}
+
+rectf text_box::bbox() const /* returns a copy of the stored bounding box */
+{
+    return m_data->bbox;
+}
+
+rectf text_box::char_bbox(size_t i) const /* returns a copy of the bounding box stored at index i */
+{
+    if (i < m_data->char_bboxes.size())
+        return m_data->char_bboxes[i];
+    return rectf(0, 0, 0, 0); /* i is out of range */
+}
+
+bool text_box::has_space_after() const /* returns the flag that page::text_list() set from TextWord::hasSpaceAfter() */
+{
+    return m_data->has_space_after;
+}
+
+std::vector<text_box> page::text_list() const /* builds one text_box per word reported by TextOutputDev */
+{
+    std::vector<text_box>  output_list;
+
+    /* Configuration values are the same as in Qt5 Page::TextList(). */
+    std::unique_ptr<TextOutputDev> output_dev{
+        new TextOutputDev(nullptr,    /* char* fileName */
+                          gFalse,  /* GBool physLayoutA */
+                          0,       /* double fixedPitchA */
+                          gFalse,  /* GBool rawOrderA */
+                          gFalse)  /* GBool append */
+    };
+
+    /*
+     * Configuration values are the same as in Qt5 Page::TextList(),
+     * except that the rotation is fixed to zero,
+     * because few people use non-zero values.
+     */
+    d->doc->doc->displayPageSlice(output_dev.get(),
+                                  d->index + 1,           /* page */
+                                  72, 72, 0,              /* hDPI, vDPI, rot */
+                                  gFalse, gFalse, gFalse, /* useMediaBox, crop, printing */
+                                  -1, -1, -1, -1,         /* sliceX, sliceY, sliceW, sliceH */
+                                  nullptr, nullptr,       /* abortCheckCbk(), abortCheckCbkData */
+                                  nullptr, nullptr,       /* annotDisplayDecideCbk(), annotDisplayDecideCbkData */
+                                  gTrue);                 /* copyXRef */
+
+    if (std::unique_ptr< TextWordList > word_list{output_dev->makeWordList()}) { /* skipped when makeWordList() returns null */
+
+        output_list.reserve(word_list->getLength());
+        for (int i = 0; i < word_list->getLength(); i ++) {
+            TextWord *word = word_list->get(i);
+
+            std::unique_ptr<GooString> gooWord{word->getText()}; /* unique_ptr deletes the GooString returned by getText() */
+            ustring ustr = detail::unicode_GooString_to_ustring(gooWord.get());
+
+            double xMin, yMin, xMax, yMax;
+            word->getBBox(&xMin, &yMin, &xMax, &yMax);
+
+            text_box tb{new text_box_data{
+                ustr,
+                {xMin, yMin, xMax-xMin, yMax-yMin}, /* bbox: xMin, yMin, then the x and y extents */
+                {}, /* char_bboxes: filled in by the loop below */
+                word->hasSpaceAfter() == gTrue
+            }};
+
+            tb.m_data->char_bboxes.reserve(word->getLength());
+            for (int j = 0; j < word->getLength(); j ++) {
+                word->getCharBBox(j, &xMin, &yMin, &xMax, &yMax); /* reuses the variables that held the word bbox */
+                tb.m_data->char_bboxes.push_back({xMin, yMin, xMax-xMin, yMax-yMin});
+            }
+
+            output_list.push_back(std::move(tb));
+        }
+    }
+
+    return output_list;
+}
diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index 7b4298a1..df5cb36a 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2009-2010, Pino Toscano <pino at kde.org>
+ * Copyright (C) 2018, Suzuki Toshiya <mpsuzuki at hiroshima-u.ac.jp>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -22,9 +23,45 @@
 #include "poppler-global.h"
 #include "poppler-rectangle.h"
 
+#include <memory>
+
 namespace poppler
 {
 
+struct text_box_data; /* defined in poppler-private.h */
+class POPPLER_CPP_EXPORT text_box
+{
+    friend class page; /* page::text_list() uses the private constructor and m_data */
+public:
+    text_box(text_box&&) = default;
+    text_box& operator=(text_box&&) = default;
+
+    ~text_box();
+
+    ustring   text() const;
+    rectf     bbox() const;
+
+    /**
+       Get the bounding box of the i-th glyph
+
+       This method returns a rectf holding the bounding box of
+       the i-th glyph in the text_box.
+
+       \note The text_box object owns the stored rectf objects,
+       the caller does not need to free them.
+
+       \warning If the glyph index is too large, rectf(0,0,0,0)
+       is returned. The number of glyphs and the number of ustring
+       codepoints might differ in some complex scripts.
+     */
+    rectf     char_bbox(size_t i) const;
+    bool      has_space_after() const;
+private:
+    text_box(text_box_data *data); /* takes ownership of data */
+
+    std::unique_ptr<text_box_data> m_data;
+};
+
 class document;
 class document_private;
 class page_private;
@@ -63,6 +100,24 @@ public:
     ustring text(const rectf &rect = rectf()) const;
     ustring text(const rectf &rect, text_layout_enum layout_mode) const;
 
+    /**
+       Returns the text of the page as a list of text boxes
+
+       This method returns a std::vector of text_box that contains all
+       the text of the page, with roughly one word of text
+       per text_box item.
+
+       For text written in western languages (left-to-right and
+       up-to-down), the std::vector contains the text in the proper
+       order.
+
+       \note The page object owns the text_box objects as unique_ptr,
+             the caller is not needed to free them.
+
+       \warning This method has not been tested with Asian scripts
+    */
+    std::vector<text_box> text_list() const;
+
 private:
     page(document_private *doc, int index);
 
diff --git a/cpp/poppler-private.h b/cpp/poppler-private.h
index 147073d9..3753567f 100644
--- a/cpp/poppler-private.h
+++ b/cpp/poppler-private.h
@@ -3,6 +3,7 @@
  * Copyright (C) 2013 Adrian Johnson <ajohnson at redneon.com>
  * Copyright (C) 2014, Hans-Peter Deifel <hpdeifel at gmx.de>
  * Copyright (C) 2016 Jakub Alba <jakubalba at gmail.com>
+ * Copyright (C) 2018, Suzuki Toshiya <mpsuzuki at hiroshima-u.ac.jp>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -67,6 +68,14 @@ void delete_all(const Collection &c)
     delete_all(c.begin(), c.end());
 }
 
+struct text_box_data /* private payload of text_box; filled in by page::text_list() */
+{
+    ustring text; /* text of the word */
+    rectf bbox; /* bounding box of the whole word */
+    std::vector<rectf> char_bboxes; /* one bounding box per character index of the word */
+    bool has_space_after; /* true when TextWord::hasSpaceAfter() returned gTrue */
+};
+
 }
 
 #endif
diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp
index c147aad7..a1a68251 100644
--- a/cpp/tests/poppler-dump.cpp
+++ b/cpp/tests/poppler-dump.cpp
@@ -2,6 +2,7 @@
  * Copyright (C) 2009-2010, Pino Toscano <pino at kde.org>
  * Copyright (C) 2017, 2018, Albert Astals Cid <aacid at kde.org>
  * Copyright (C) 2017, Jason Alan Palmer <jalanpalmer at gmail.com>
+ * Copyright (C) 2018, Suzuki Toshiya <mpsuzuki at hiroshima-u.ac.jp>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -50,6 +51,7 @@ bool show_embedded_files = false;
 bool show_pages = false;
 bool show_help = false;
 char show_text[32];
+bool show_text_list = false;
 poppler::page::text_layout_enum show_text_layout = poppler::page::physical_layout;
 
 static const ArgDesc the_args[] = {
@@ -71,6 +73,8 @@ static const ArgDesc the_args[] = {
       "show pages information" },
     { "--show-text",           argString, &show_text,          sizeof(show_text),
       "show text (physical|raw) extracted from all pages" },
+    { "--show-text-list",      argFlag, &show_text_list,       0,
+      "show text list (experimental)" },
     { "-h",                    argFlag,  &show_help,           0,
       "print usage information" },
     { "--help",                argFlag,  &show_help,           0,
@@ -323,6 +327,28 @@ static void print_page_text(poppler::page *p)
     std::cout << std::endl;
 }
 
+static void print_page_text_list(poppler::page *p) /* prints the text and bbox of every text_box of page p */
+{
+    if (!p) { /* null page: report it and stop */
+        std::cout << std::setw(out_width) << "Broken Page. Could not be parsed" << std::endl;
+        std::cout << std::endl;
+        return;
+    }
+    auto text_list = p->text_list();
+
+    std::cout << "---" << std::endl; /* opening delimiter of the list */
+    for (size_t i = 0; i < text_list.size(); i ++) {
+        poppler::rectf bbox = text_list[i].bbox();
+        poppler::ustring ustr = text_list[i].text();
+        std::cout << "[" << ustr << "] @ "; /* one output line per text_box: the text, then its bbox */
+        std::cout << "( x=" << bbox.x() << " y=" << bbox.y() << " w=" << bbox.width() << " h=" << bbox.height() << " )";
+        std::cout << std::endl;
+
+    }
+    std::cout << "---" << std::endl; /* closing delimiter of the list */
+}
+
+
 int main(int argc, char *argv[])
 {
     if (!parseArgs(the_args, &argc, argv)
@@ -398,6 +424,14 @@ int main(int argc, char *argv[])
             print_page_text(p.get());
         }
     }
+    if (show_text_list) {
+        const int pages = doc->pages();
+        for (int i = 0; i < pages; ++i) {
+            std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl;
+            std::unique_ptr<poppler::page> p(doc->create_page(i));
+            print_page_text_list(p.get());
+        }
+    }
 
     return 0;
 }


More information about the poppler mailing list