[poppler] 10 commits - cpp/CMakeLists.txt cpp/poppler-font.cpp cpp/poppler-font.h cpp/poppler-font-private.h cpp/poppler-page.cpp cpp/poppler-page.h cpp/poppler-page-private.h cpp/poppler-private.h cpp/tests poppler/TextOutputDev.cc poppler/TextOutputDev.h

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Tue May 19 21:12:15 UTC 2020


 cpp/CMakeLists.txt         |    1 
 cpp/poppler-font-private.h |   82 +++++++++++++++++++++++++++++++
 cpp/poppler-font.cpp       |   56 ++-------------------
 cpp/poppler-font.h         |    3 +
 cpp/poppler-page-private.h |    5 +
 cpp/poppler-page.cpp       |  118 ++++++++++++++++++++++++++++++++++++++++++---
 cpp/poppler-page.h         |   78 +++++++++++++++++++++++++++++
 cpp/poppler-private.h      |   31 +++++++++++
 cpp/tests/poppler-dump.cpp |   20 +++++--
 poppler/TextOutputDev.cc   |    4 +
 poppler/TextOutputDev.h    |    1 
 11 files changed, 338 insertions(+), 61 deletions(-)

New commits:
commit 3189332012ca46998f8ffb872e7ed81c630c4c7a
Author: suzuki toshiya <mpsuzuki at hiroshima-u.ac.jp>
Date:   Sat May 16 04:54:55 2020 +0000

    [cpp] separate the font info in text_box to another struct.
    
    * add new API, page::text_list(int opt_flag). The old one
    taking no argument is kept for ABI compatibility.
    The opt_flag is a bitwise OR of values from the new enum,
    page::text_list_option_enum.
    
    * text_box.m_data->text_box_font is a unique pointer to
    the storage (if text_list() requests the font info), or
    just a null pointer (if text_list() does not request the
    font info).
    
    * new option "--show-text-list-with-font" showing font
    info, to tests/poppler-dump.cpp. "--show-text-list"
    does not load the font info at all.
    
    Co-authored-by: Adam Reichold <adam.reichold at t-online.de>
    Co-authored-by: Albert Astals Cid <aacid at kde.org>

diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index f274ca5b..01b0409d 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -299,7 +299,7 @@ static void appendToGooString(void *stream, const char *text, int len) {
 ustring page::text(const rectf &r, text_layout_enum layout_mode) const
 {
     std::unique_ptr<GooString> out(new GooString());
-    const bool use_raw_order = (layout_mode == raw_order_layout); 
+    const bool use_raw_order = (layout_mode == raw_order_layout);
     const bool use_physical_layout = (layout_mode == physical_layout);
     TextOutputDev td(&appendToGooString, out.get(), use_physical_layout, 0, use_raw_order, false);
     if (r.is_empty()) {
@@ -311,6 +311,11 @@ ustring page::text(const rectf &r, text_layout_enum layout_mode) const
     return ustring::from_utf8(out->c_str());
 }
 
+/*
+ * text_box_font_info object for text_box
+ */
+text_box_font_info_data::~text_box_font_info_data() = default;
+
 /*
  * text_box object for page::text_list()
  */
@@ -352,30 +357,41 @@ bool text_box::has_space_after() const
     return m_data->has_space_after;
 }
 
+bool text_box::has_font_info() const
+{
+    return (m_data->text_box_font != nullptr);
+}
+
 text_box::writing_mode_enum text_box::get_wmode(int i) const
 {
-    return m_data->wmodes[i];
+    if (this->has_font_info())
+        return m_data->text_box_font->wmodes[i];
+    else
+        return text_box::invalid_wmode;
 }
 
 double text_box::get_font_size() const
 {
-    return m_data->font_size;
+    if (this->has_font_info())
+        return m_data->text_box_font->font_size;
+    else
+        return -1;
 }
 
 std::string text_box::get_font_name(int i) const
 {
-    int j = m_data->glyph_to_cache_index[i];
+    if (!this->has_font_info())
+        return std::string("*ignored*");
+
+    int j = m_data->text_box_font->glyph_to_cache_index[i];
     if (j < 0) {
         return std::string("");
     }
-    return m_data->font_info_cache[j].name();
+    return m_data->text_box_font->font_info_cache[j].name();
 }
 
-
-std::vector<text_box> page::text_list() const
+std::vector<text_box> page::text_list(int opt_flag) const
 {
-    d->init_font_info_cache();
-
     std::vector<text_box>  output_list;
 
     /* config values are same with Qt5 Page::TextList() */
@@ -419,41 +435,55 @@ std::vector<text_box> page::text_list() const
                 word->getRotation(),
                 {},
                 word->hasSpaceAfter() == true,
-                {},
-                word->getFontSize(),
-                d->font_info_cache,
-                {}
+                nullptr
             }};
 
+            std::unique_ptr<text_box_font_info_data> tb_font_info = nullptr;
+            if (opt_flag & page::text_list_include_font) {
+                d->init_font_info_cache();
+
+                std::unique_ptr<text_box_font_info_data> tb_font{new text_box_font_info_data{
+                    word->getFontSize(), // double font_size
+                    {},                  // std::vector<text_box::writing_mode> wmodes;
+                    d->font_info_cache,  // std::vector<font_info> font_info_cache;
+                    {}                   // std::vector<int> glyph_to_cache_index;
+                }};
+
+                tb_font_info = std::move(tb_font);
+            };
+
             tb.m_data->char_bboxes.reserve(word->getLength());
             for (int j = 0; j < word->getLength(); j ++) {
                 word->getCharBBox(j, &xMin, &yMin, &xMax, &yMax);
                 tb.m_data->char_bboxes.emplace_back(xMin, yMin, xMax-xMin, yMax-yMin);
             }
 
-            tb.m_data->glyph_to_cache_index.reserve(word->getLength());
-            for (int j = 0; j < word->getLength(); j++) {
-                const TextFontInfo* cur_text_font_info = word->getFontInfo(j);
-
-                // filter-out the invalid WMode value here.
-                switch (cur_text_font_info->getWMode()) {
-                case 0:
-                    tb.m_data->wmodes.push_back(text_box::horizontal_wmode);
-                    break;
-                case 1:
-                    tb.m_data->wmodes.push_back(text_box::vertical_wmode);
-                    break;
-                default:
-                    tb.m_data->wmodes.push_back(text_box::invalid_wmode);
-                };
-
-                tb.m_data->glyph_to_cache_index[j] = -1;
-                for (size_t k = 0; k < d->font_info_cache.size(); k++) {
-                    if (cur_text_font_info->matches(&(d->font_info_cache[k].d->ref))) {
-                        tb.m_data->glyph_to_cache_index[j] = k;
+            if (tb_font_info && d->font_info_cache_initialized) {
+                tb_font_info->glyph_to_cache_index.reserve(word->getLength());
+                for (int j = 0; j < word->getLength(); j++) {
+                    const TextFontInfo* cur_text_font_info = word->getFontInfo(j);
+
+                    // filter-out the invalid WMode value here.
+                    switch (cur_text_font_info->getWMode()) {
+                    case 0:
+                        tb_font_info->wmodes.push_back(text_box::horizontal_wmode);
                         break;
+                    case 1:
+                        tb_font_info->wmodes.push_back(text_box::vertical_wmode);
+                        break;
+                    default:
+                        tb_font_info->wmodes.push_back(text_box::invalid_wmode);
+                    };
+
+                    tb_font_info->glyph_to_cache_index[j] = -1;
+                    for (size_t k = 0; k < tb_font_info->font_info_cache.size(); k++) {
+                        if (cur_text_font_info->matches(&(tb_font_info->font_info_cache[k].d->ref))) {
+                            tb_font_info->glyph_to_cache_index[j] = k;
+                            break;
+                        }
                     }
                 }
+                tb.m_data->text_box_font = std::move(tb_font_info);
             }
 
             output_list.push_back(std::move(tb));
@@ -462,3 +492,8 @@ std::vector<text_box> page::text_list() const
 
     return output_list;
 }
+
+std::vector<text_box> page::text_list() const
+{
+    return text_list(0);
+}
diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index ca5be2fd..dd6ebf2c 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -66,6 +66,12 @@ public:
     rectf     char_bbox(size_t i) const;
     bool      has_space_after() const;
 
+
+    /**
+      \since 0.89
+     */
+    bool      has_font_info() const;
+
     /**
        Get a writing mode for the i-th glyph
 
@@ -186,6 +192,22 @@ public:
     */
     std::vector<text_box> text_list() const;
 
+    /*
+     * text_list_option_enum is a bitmask-style flags for text_list(),
+     * 0 means the default & simplest behaviour.
+     */
+    enum text_list_option_enum {
+        text_list_include_font = 1 // \since 0.89
+    };
+
+    /**
+       Extended version of text_list() taking an option flag.
+       The option flag should be the multiple of text_list_option_enum.
+
+       \since 0.89
+    */
+    std::vector<text_box> text_list(int opt_flag) const;
+
 private:
     page(document_private *doc, int index);
 
diff --git a/cpp/poppler-private.h b/cpp/poppler-private.h
index 83e46319..4ec159a8 100644
--- a/cpp/poppler-private.h
+++ b/cpp/poppler-private.h
@@ -73,23 +73,17 @@ void delete_all(const Collection &c)
 }
 
 class font_info;
-struct text_box_data
+struct text_box_font_info_data
 {
-    ~text_box_data();
-
-    ustring text;
-    rectf bbox;
-    int rotation;
-    std::vector<rectf> char_bboxes;
-    bool has_space_after;
+    ~text_box_font_info_data();
 
-    std::vector<text_box::writing_mode_enum> wmodes; 
     double font_size;
+    std::vector<text_box::writing_mode_enum> wmodes; 
 
     /*
      * a duplication of the font_info_cache created by the
      * poppler::font_iterator and owned by the poppler::page
-     *  object. Its lifetime might differ from that of text_box
+     * object. Its lifetime might differ from that of text_box
      * object (think about collecting all text_box objects
      * from all pages), so we have to duplicate it into all
      * text_box instances.
@@ -97,7 +91,7 @@ struct text_box_data
     std::vector<font_info> font_info_cache;
 
     /*
-     * a std::vector from the glyph index in the current
+     * a std::vector from the glyph index in the owner
      * text_box to the font_info index in font_info_cache. 
      * The "-1" means no corresponding fonts found in the
      * cache.
@@ -105,6 +99,20 @@ struct text_box_data
     std::vector<int> glyph_to_cache_index;
 };
 
+class font_info;
+struct text_box_data
+{
+    ~text_box_data();
+
+    ustring text;
+    rectf bbox;
+    int rotation;
+    std::vector<rectf> char_bboxes;
+    bool has_space_after;
+
+    std::unique_ptr<text_box_font_info_data> text_box_font;
+};
+
 }
 
 #endif
diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp
index 7864979e..ef391d78 100644
--- a/cpp/tests/poppler-dump.cpp
+++ b/cpp/tests/poppler-dump.cpp
@@ -60,6 +60,7 @@ bool show_help = false;
 bool show_version = false;
 char show_text[32];
 bool show_text_list = false;
+bool show_text_list_with_font = false;
 poppler::page::text_layout_enum show_text_layout = poppler::page::physical_layout;
 
 static const ArgDesc the_args[] = {
@@ -85,6 +86,8 @@ static const ArgDesc the_args[] = {
       "show text (physical|raw|none) extracted from all pages" },
     { "--show-text-list",      argFlag, &show_text_list,       0,
       "show text list (experimental)" },
+    { "--show-text-list-with-font",  argFlag, &show_text_list_with_font, 0,
+      "show text list with font info (experimental)" },
     { "-h",                    argFlag,  &show_help,           0,
       "print usage information" },
     { "--help",                argFlag,  &show_help,           0,
@@ -417,14 +420,14 @@ static void print_page_text(poppler::page *p)
     std::cout << std::endl;
 }
 
-static void print_page_text_list(poppler::page *p)
+static void print_page_text_list(poppler::page *p, int opt_flag = 0)
 {
     if (!p) {
         std::cout << std::setw(out_width) << "Broken Page. Could not be parsed" << std::endl;
         std::cout << std::endl;
         return;
     }
-    auto text_list = p->text_list();
+    auto text_list = p->text_list(opt_flag);
 
     std::cout << "---" << std::endl;
     for (const poppler::text_box &text : text_list) {
@@ -435,9 +438,9 @@ static void print_page_text_list(poppler::page *p)
         std::string font_name = text.get_font_name();
         std::cout << "[" << ustr << "] @ ";
         std::cout << "( x=" << bbox.x() << " y=" << bbox.y() << " w=" << bbox.width() << " h=" << bbox.height() << " )";
-        std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )";
+        if (text.has_font_info())
+            std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )";
         std::cout << std::endl;
-
     }
     std::cout << "---" << std::endl;
 }
@@ -538,12 +541,15 @@ int main(int argc, char *argv[])
             print_page_text(p.get());
         }
     }
-    if (show_text_list) {
+    if (show_text_list || show_text_list_with_font) {
         const int pages = doc->pages();
         for (int i = 0; i < pages; ++i) {
             std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl;
             std::unique_ptr<poppler::page> p(doc->create_page(i));
-            print_page_text_list(p.get());
+            if (show_text_list_with_font)
+                print_page_text_list(p.get(), poppler::page::text_list_include_font);
+            else
+                print_page_text_list(p.get(), 0);
         }
     }
 
commit 437553ecb26948f77c3dbf7ad29bca86ffff7f6e
Author: Albert Astals Cid <aacid at kde.org>
Date:   Fri May 15 12:57:32 2020 +0000

    [cpp] change page_private::init_font_info_cache() to a void method.
    
    We already have a boolean font_info_cache_initialized, no need to
    guess the initialization result by the size of initialized cache.

diff --git a/cpp/poppler-page-private.h b/cpp/poppler-page-private.h
index d4954e9d..442f8bb1 100644
--- a/cpp/poppler-page-private.h
+++ b/cpp/poppler-page-private.h
@@ -50,7 +50,7 @@ public:
 
     std::vector<font_info> font_info_cache;
     bool font_info_cache_initialized;
-    size_t init_font_info_cache();
+    void init_font_info_cache();
 };
 
 }
diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index e44ef26e..f274ca5b 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -57,10 +57,10 @@ page_private::~page_private()
     delete transition;
 }
 
-size_t page_private::init_font_info_cache()
+void page_private::init_font_info_cache()
 {
     if (font_info_cache_initialized)
-	return font_info_cache.size();
+	return;
 
     poppler::font_iterator it(index, doc);
 
@@ -69,7 +69,7 @@ size_t page_private::init_font_info_cache()
     }
 
     font_info_cache_initialized = true;
-    return font_info_cache.size();
+    return;
 }
 
 /**
commit 57de32198a4406eae18b80eed42e6050e2b48cca
Author: Albert Astals Cid <aacid at kde.org>
Date:   Fri May 15 12:23:50 2020 +0000

    [cpp] in poppler-page.h, add "since 0.89" comment to 3 new methods.

diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index 50ccdb06..ca5be2fd 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -66,10 +66,6 @@ public:
     rectf     char_bbox(size_t i) const;
     bool      has_space_after() const;
 
-    /**
-      \since 0.8x
-     */
-
     /**
        Get a writing mode for the i-th glyph
 
@@ -85,6 +81,10 @@ public:
         horizontal_wmode = 0,
         vertical_wmode = 1
     };
+
+    /**
+      \since 0.89
+     */
     writing_mode_enum get_wmode(int i = 0) const;
 
     /**
@@ -93,6 +93,10 @@ public:
        This method return a double floating value of the
        font size from the text_box instance.
      */
+
+    /**
+      \since 0.89
+     */
     double     get_font_size() const;
 
     /**
@@ -111,6 +115,10 @@ public:
        Latin1 or UTF-8. Some legacy PDF producers used
        in CJK market use GBK, Big5, Wansung or Shift-JIS.
      */
+
+    /**
+      \since 0.89
+     */
     std::string get_font_name(int i = 0) const;
 
 private:
commit 507027de297f43146f5bbebe8d098dededffc577
Author: suzuki toshiya <mpsuzuki at hiroshima-u.ac.jp>
Date:   Tue May 5 10:11:49 2020 +0000

    [cpp] introduce a boolean font_info_cache_initialized, to distinguish an initialized-but-empty cache from the uninitialized cache
    
    Co-authored-by: Adam Reichold <adam.reichold at t-online.de>

diff --git a/cpp/poppler-page-private.h b/cpp/poppler-page-private.h
index 3e2ee914..d4954e9d 100644
--- a/cpp/poppler-page-private.h
+++ b/cpp/poppler-page-private.h
@@ -49,6 +49,7 @@ public:
     { return const_cast<poppler::page *>(p)->d; }
 
     std::vector<font_info> font_info_cache;
+    bool font_info_cache_initialized;
     size_t init_font_info_cache();
 };
 
diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index b0bf847e..e44ef26e 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -48,6 +48,7 @@ page_private::page_private(document_private *_doc, int _index)
     , page(doc->doc->getCatalog()->getPage(_index + 1))
     , index(_index)
     , transition(nullptr)
+    , font_info_cache_initialized(false)
 {
 }
 
@@ -58,7 +59,7 @@ page_private::~page_private()
 
 size_t page_private::init_font_info_cache()
 {
-    if (font_info_cache.size() > 0)
+    if (font_info_cache_initialized)
 	return font_info_cache.size();
 
     poppler::font_iterator it(index, doc);
@@ -67,6 +68,7 @@ size_t page_private::init_font_info_cache()
 	font_info_cache = it.next();
     }
 
+    font_info_cache_initialized = true;
     return font_info_cache.size();
 }
 
commit 2cd79c7382888559d5d8dcc56a84572ac8a77086
Author: Adam Reichold <adam.reichold at t-online.de>
Date:   Tue May 5 01:22:29 2020 +0000

    [cpp] construct a font_iterator instance in the local storage of page_private::init_font_info_cache() method, instead of the heap

diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index 3aa6222b..b0bf847e 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -61,14 +61,12 @@ size_t page_private::init_font_info_cache()
     if (font_info_cache.size() > 0)
 	return font_info_cache.size();
 
-    poppler::font_iterator* font_iterator = new poppler::font_iterator(index, doc);
+    poppler::font_iterator it(index, doc);
 
-    if (font_iterator->has_next()) {
-	font_info_cache = font_iterator->next();
+    if (it.has_next()) {
+	font_info_cache = it.next();
     }
 
-    delete font_iterator;
-
     return font_info_cache.size();
 }
 
commit 7279b4eb397667cd4553f5852286b3f3d73a1a83
Author: Adam Reichold <adam.reichold at t-online.de>
Date:   Mon May 4 11:51:55 2020 +0000

    [cpp] remove the wrong warning note about the std::string object returned by text_box::get_font_name()

diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index 9db6f87b..50ccdb06 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -110,10 +110,6 @@ public:
        encoding of the font name is one of the ASCII,
        Latin1 or UTF-8. Some legacy PDF producers used
        in CJK market use GBK, Big5, Wansung or Shift-JIS.
-
-       \warning The returned std::string is owned by the
-       text_box instance, it should not be used in the
-       other objects or should not be destroyed directly.
      */
     std::string get_font_name(int i = 0) const;
 
commit af3805f0b60289c7f522da29f9375119a1cd778a
Author: Albert Astals Cid <aacid at kde.org>
Date:   Mon May 4 04:32:27 2020 +0000

    [cpp] new enum poppler::text_box::writing_mode_enum

diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index 715c5ec4..3aa6222b 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -352,7 +352,7 @@ bool text_box::has_space_after() const
     return m_data->has_space_after;
 }
 
-int text_box::get_wmode(int i) const
+text_box::writing_mode_enum text_box::get_wmode(int i) const
 {
     return m_data->wmodes[i];
 }
@@ -434,7 +434,18 @@ std::vector<text_box> page::text_list() const
             tb.m_data->glyph_to_cache_index.reserve(word->getLength());
             for (int j = 0; j < word->getLength(); j++) {
                 const TextFontInfo* cur_text_font_info = word->getFontInfo(j);
-                tb.m_data->wmodes.push_back(cur_text_font_info->getWMode()); 
+
+                // filter-out the invalid WMode value here.
+                switch (cur_text_font_info->getWMode()) {
+                case 0:
+                    tb.m_data->wmodes.push_back(text_box::horizontal_wmode);
+                    break;
+                case 1:
+                    tb.m_data->wmodes.push_back(text_box::vertical_wmode);
+                    break;
+                default:
+                    tb.m_data->wmodes.push_back(text_box::invalid_wmode);
+                };
 
                 tb.m_data->glyph_to_cache_index[j] = -1;
                 for (size_t k = 0; k < d->font_info_cache.size(); k++) {
diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index 6f9e755d..9db6f87b 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -73,17 +73,19 @@ public:
     /**
        Get a writing mode for the i-th glyph
 
-       This method returns an integer of the writing mode
+       This method returns an enum of the writing mode
        for the i-th glyph in the text_box.
 
-       0 means the horizontal writing mode.
-       1 means the vertical writing mode.
-
        \note Usually all glyphs in one text_box have the
        same writing mode. Thus the default value of the
        glyph index is 0.
      */
-    int        get_wmode(int i = 0) const;
+    enum writing_mode_enum {
+        invalid_wmode = -1,
+        horizontal_wmode = 0,
+        vertical_wmode = 1
+    };
+    writing_mode_enum get_wmode(int i = 0) const;
 
     /**
        Get a font size of this text_box instance.
diff --git a/cpp/poppler-private.h b/cpp/poppler-private.h
index 0fe33d3f..83e46319 100644
--- a/cpp/poppler-private.h
+++ b/cpp/poppler-private.h
@@ -28,6 +28,7 @@
 
 #include "poppler-global.h"
 #include "poppler-rectangle.h"
+#include "poppler-page.h" // to use text_box::writing_mode_enum
 
 #include "Error.h"
 #include "CharTypes.h"
@@ -82,7 +83,7 @@ struct text_box_data
     std::vector<rectf> char_bboxes;
     bool has_space_after;
 
-    std::vector<int> wmodes; 
+    std::vector<text_box::writing_mode_enum> wmodes; 
     double font_size;
 
     /*
commit 65053f43dbb83b66302bddda27732168fc74cca1
Author: Albert Astals Cid <aacid at kde.org>
Date:   Sun May 3 16:21:38 2020 +0000

    [TextOutputDev] simplify TextFontInfo::matches(const Ref *ref)

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 84af8af2..993a37da 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -342,7 +342,7 @@ bool TextFontInfo::matches(const TextFontInfo *fontInfo) const {
 }
 
 bool TextFontInfo::matches(const Ref *ref) const {
-  return (gfxFont->getID()->num == ref->num && gfxFont->getID()->gen == ref->gen);
+  return (*(gfxFont->getID()) == *ref);
 }
 
 double TextFontInfo::getAscent() const {
commit 4ea2e879d4e0e9a5d899adb82bbdaab9e505532c
Author: Albert Astals Cid <aacid at kde.org>
Date:   Sun May 3 16:17:11 2020 +0000

    [cpp] simplify the initialization of poppler::font_info_private.ref and .emb_ref

diff --git a/cpp/poppler-font-private.h b/cpp/poppler-font-private.h
index b24cbaf0..aa26e2f9 100644
--- a/cpp/poppler-font-private.h
+++ b/cpp/poppler-font-private.h
@@ -48,10 +48,8 @@ public:
             font_file = fi->getFile()->c_str();
         }
 
-        ref.num = fi->getRef().num;
-        ref.gen = fi->getRef().gen;
-        emb_ref.num = fi->getEmbRef().num;
-        emb_ref.gen = fi->getEmbRef().gen;
+        ref = fi->getRef();
+        emb_ref = fi->getEmbRef();
     }
 
     std::string font_name;
commit 60400514324d6e5d0a1c50ce4af84320d350e967
Author: suzuki toshiya <mpsuzuki at hiroshima-u.ac.jp>
Date:   Fri May 1 08:04:14 2020 +0000

    [cpp] Add the font infos to the text_box object.

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 627920ff..32b3ef88 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -38,6 +38,7 @@ install(FILES
   poppler-document.h
   poppler-embedded-file.h
   poppler-font.h
+  poppler-font-private.h
   poppler-global.h
   poppler-image.h
   poppler-page.h
diff --git a/cpp/poppler-font-private.h b/cpp/poppler-font-private.h
new file mode 100644
index 00000000..b24cbaf0
--- /dev/null
+++ b/cpp/poppler-font-private.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2009, Pino Toscano <pino at kde.org>
+ * Copyright (C) 2015, Tamas Szekeres <szekerest at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "poppler-font.h"
+
+#include "poppler-document-private.h"
+
+#include "FontInfo.h"
+
+#include <algorithm>
+
+using namespace poppler;
+
+class poppler::font_info_private
+{
+public:
+    font_info_private()
+        : type(font_info::unknown)
+        , is_embedded(false)
+        , is_subset(false)
+    {
+    }
+    font_info_private(FontInfo *fi)
+        : type((font_info::type_enum)fi->getType())
+        , is_embedded(fi->getEmbedded())
+        , is_subset(fi->getSubset())
+    {
+        if (fi->getName()) {
+            font_name = fi->getName()->c_str();
+        }
+        if (fi->getFile()) {
+            font_file = fi->getFile()->c_str();
+        }
+
+        ref.num = fi->getRef().num;
+        ref.gen = fi->getRef().gen;
+        emb_ref.num = fi->getEmbRef().num;
+        emb_ref.gen = fi->getEmbRef().gen;
+    }
+
+    std::string font_name;
+    std::string font_file;
+    font_info::type_enum type : 5;
+    bool is_embedded : 1;
+    bool is_subset : 1;
+
+    Ref ref;
+    Ref emb_ref;
+};
+
+
+class poppler::font_iterator_private
+{
+public:
+    font_iterator_private(int start_page, document_private *dd)
+        : font_info_scanner(dd->doc, start_page)
+        , total_pages(dd->doc->getNumPages())
+        , current_page((std::max)(start_page, 0))
+    {
+    }
+    ~font_iterator_private()
+    {
+    }
+
+    FontInfoScanner font_info_scanner;
+    int total_pages;
+    int current_page;
+};
diff --git a/cpp/poppler-font.cpp b/cpp/poppler-font.cpp
index 6d833c19..e8a4076f 100644
--- a/cpp/poppler-font.cpp
+++ b/cpp/poppler-font.cpp
@@ -24,6 +24,8 @@
  */
 #include "poppler-font.h"
 
+#include "poppler-font-private.h"
+
 #include "poppler-document-private.h"
 
 #include "FontInfo.h"
@@ -32,54 +34,6 @@
 
 using namespace poppler;
 
-class poppler::font_info_private
-{
-public:
-    font_info_private()
-        : type(font_info::unknown)
-        , is_embedded(false)
-        , is_subset(false)
-    {
-    }
-    font_info_private(FontInfo *fi)
-        : type((font_info::type_enum)fi->getType())
-        , is_embedded(fi->getEmbedded())
-        , is_subset(fi->getSubset())
-    {
-        if (fi->getName()) {
-            font_name = fi->getName()->c_str();
-        }
-        if (fi->getFile()) {
-            font_file = fi->getFile()->c_str();
-        }
-    }
-
-    std::string font_name;
-    std::string font_file;
-    font_info::type_enum type : 5;
-    bool is_embedded : 1;
-    bool is_subset : 1;
-};
-
-
-class poppler::font_iterator_private
-{
-public:
-    font_iterator_private(int start_page, document_private *dd)
-        : font_info_scanner(dd->doc, start_page)
-        , total_pages(dd->doc->getNumPages())
-        , current_page((std::max)(start_page, 0))
-    {
-    }
-    ~font_iterator_private()
-    {
-    }
-
-    FontInfoScanner font_info_scanner;
-    int total_pages;
-    int current_page;
-};
-
 /**
  \class poppler::font_info poppler-font.h "poppler/cpp/poppler-font.h"
 
@@ -208,7 +162,7 @@ font_iterator::~font_iterator()
 }
 
 /**
- Returns the fonts of the current page and advances to the next one.
+ \returns the fonts of the current page and advances to the next one.
  */
 std::vector<font_info> font_iterator::next()
 {
@@ -218,6 +172,10 @@ std::vector<font_info> font_iterator::next()
 
     ++d->current_page;
 
+    /* FontInfoScanner::scan() receives a number how many pages to
+     * be scanned from the *current page*, not from the beginning.
+     * We restrict the font scanning to the current page only.
+     */
     const std::vector<FontInfo*> items = d->font_info_scanner.scan(1);
     std::vector<font_info> fonts;
     fonts.reserve(items.size());
diff --git a/cpp/poppler-font.h b/cpp/poppler-font.h
index 854b7a40..27667e78 100644
--- a/cpp/poppler-font.h
+++ b/cpp/poppler-font.h
@@ -67,6 +67,7 @@ private:
 
     font_info_private *d;
     friend class font_iterator;
+    friend class page;
 };
 
 
@@ -84,6 +85,8 @@ private:
 
     font_iterator_private *d;
     friend class document;
+    friend class page;
+    friend class page_private;
 };
 
 }
diff --git a/cpp/poppler-page-private.h b/cpp/poppler-page-private.h
index e0c3446d..3e2ee914 100644
--- a/cpp/poppler-page-private.h
+++ b/cpp/poppler-page-private.h
@@ -29,6 +29,7 @@ namespace poppler
 
 class document_private;
 class page_transition;
+class font_info;
 
 class page_private
 {
@@ -46,6 +47,9 @@ public:
 
     static inline page_private* get(const poppler::page *p)
     { return const_cast<poppler::page *>(p)->d; }
+
+    std::vector<font_info> font_info_cache;
+    size_t init_font_info_cache();
 };
 
 }
diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index 7150cd78..715c5ec4 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -32,6 +32,8 @@
 #include "poppler-document-private.h"
 #include "poppler-page-private.h"
 #include "poppler-private.h"
+#include "poppler-font-private.h"
+#include "poppler-font.h"
 
 #include "TextOutputDev.h"
 
@@ -54,6 +56,22 @@ page_private::~page_private()
     delete transition;
 }
 
+size_t page_private::init_font_info_cache()
+{
+    if (font_info_cache.size() > 0)
+	return font_info_cache.size();
+
+    poppler::font_iterator* font_iterator = new poppler::font_iterator(index, doc);
+
+    if (font_iterator->has_next()) {
+	font_info_cache = font_iterator->next();
+    }
+
+    delete font_iterator;
+
+    return font_info_cache.size();
+}
+
 /**
  \class poppler::page poppler-page.h "poppler/cpp/poppler-page.h"
 
@@ -334,17 +352,39 @@ bool text_box::has_space_after() const
     return m_data->has_space_after;
 }
 
+int text_box::get_wmode(int i) const
+{
+    return m_data->wmodes[i];
+}
+
+double text_box::get_font_size() const
+{
+    return m_data->font_size;
+}
+
+std::string text_box::get_font_name(int i) const
+{
+    int j = m_data->glyph_to_cache_index[i];
+    if (j < 0) {
+        return std::string("");
+    }
+    return m_data->font_info_cache[j].name();
+}
+
+
 std::vector<text_box> page::text_list() const
 {
+    d->init_font_info_cache();
+
     std::vector<text_box>  output_list;
 
     /* config values are same with Qt5 Page::TextList() */
     auto output_dev = std::make_unique<TextOutputDev>(
-	nullptr,    /* char* fileName */
-	false,  /* bool physLayoutA */
+	nullptr, /* char* fileName */
+	false,   /* bool physLayoutA */
 	0,       /* double fixedPitchA */
-	false,  /* bool rawOrderA */
-	false  /* bool append */
+	false,   /* bool rawOrderA */
+	false    /* bool append */
     );
 
     /*
@@ -378,7 +418,11 @@ std::vector<text_box> page::text_list() const
                 {xMin, yMin, xMax-xMin, yMax-yMin},
                 word->getRotation(),
                 {},
-                word->hasSpaceAfter() == true
+                word->hasSpaceAfter() == true,
+                {},
+                word->getFontSize(),
+                d->font_info_cache,
+                {}
             }};
 
             tb.m_data->char_bboxes.reserve(word->getLength());
@@ -387,6 +431,20 @@ std::vector<text_box> page::text_list() const
                 tb.m_data->char_bboxes.emplace_back(xMin, yMin, xMax-xMin, yMax-yMin);
             }
 
+            tb.m_data->glyph_to_cache_index.reserve(word->getLength()); /* capacity only; filled by push_back below */
+            for (int j = 0; j < word->getLength(); j++) {
+                const TextFontInfo* cur_text_font_info = word->getFontInfo(j);
+                tb.m_data->wmodes.push_back(cur_text_font_info->getWMode());
+                int cache_index = -1; /* stays -1 when no cached font matches glyph j */
+                for (size_t k = 0; k < d->font_info_cache.size(); k++) {
+                    if (cur_text_font_info->matches(&(d->font_info_cache[k].d->ref))) {
+                        cache_index = static_cast<int>(k);
+                        break;
+                    }
+                }
+                tb.m_data->glyph_to_cache_index.push_back(cache_index); /* grows size(), unlike operator[] */
+            }
+
             output_list.push_back(std::move(tb));
         }
     }
diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index 30ede302..6f9e755d 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -65,6 +65,56 @@ public:
      */
     rectf     char_bbox(size_t i) const;
     bool      has_space_after() const;
+
+    /**
+      \since 0.8x
+     */
+
+    /**
+       Get a writing mode for the i-th glyph
+
+       This method returns an integer of the writing mode
+       for the i-th glyph in the text_box.
+
+       0 means the horizontal writing mode.
+       1 means the vertical writing mode.
+
+       \note Usually all glyphs in one text_box have the
+       same writing mode. Thus the default value of the
+       glyph index is 0.
+     */
+    int        get_wmode(int i = 0) const;
+
+    /**
+       Get a font size of this text_box instance.
+
+       This method returns the font size of the text_box
+       instance as a double-precision floating-point value.
+     */
+    double     get_font_size() const;
+
+    /**
+       Get a font name for the i-th glyph
+
+       This method returns a std::string object holding
+       the font name for the i-th glyph.
+
+       \note The randomization prefixes of embedded fonts
+       are not removed. Font names that include these
+       prefixes are insufficient to determine whether
+       two fonts are the same or different.
+
+       \note Clients should not assume that the encoding
+       of the font name is ASCII, Latin-1 or UTF-8. Some
+       legacy PDF producers used in the CJK market use
+       GBK, Big5, Wansung or Shift-JIS.
+
+       \note The std::string is returned by value, so the
+       caller owns its copy; an empty string is returned
+       when no font is found for the glyph.
+     */
+    std::string get_font_name(int i = 0) const;
+
 private:
     text_box(text_box_data *data);
 
diff --git a/cpp/poppler-private.h b/cpp/poppler-private.h
index b9bc9b52..0fe33d3f 100644
--- a/cpp/poppler-private.h
+++ b/cpp/poppler-private.h
@@ -71,6 +71,7 @@ void delete_all(const Collection &c)
     delete_all(c.begin(), c.end());
 }
 
+class font_info;
 struct text_box_data
 {
     ~text_box_data();
@@ -80,6 +81,27 @@ struct text_box_data
     int rotation;
     std::vector<rectf> char_bboxes;
     bool has_space_after;
+
+    std::vector<int> wmodes; 
+    double font_size;
+
+    /*
+     * A copy of the font_info_cache that is built with
+     * poppler::font_iterator and owned by the poppler::page
+     * object. The lifetime of the page may differ from that
+     * of the text_box object (think about collecting all
+     * text_box objects from all pages), so the cache has to
+     * be duplicated into every text_box instance.
+     */
+    std::vector<font_info> font_info_cache;
+
+    /*
+     * Maps each glyph index in the current text_box to the
+     * index of its font_info in font_info_cache.
+     * A value of -1 means that no corresponding font was
+     * found in the cache.
+     */
+    std::vector<int> glyph_to_cache_index;
 };
 
 }
diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp
index 6196b675..7864979e 100644
--- a/cpp/tests/poppler-dump.cpp
+++ b/cpp/tests/poppler-dump.cpp
@@ -430,8 +430,12 @@ static void print_page_text_list(poppler::page *p)
     for (const poppler::text_box &text : text_list) {
         poppler::rectf bbox = text.bbox();
         poppler::ustring ustr = text.text();
+        int wmode = text.get_wmode();
+        double font_size = text.get_font_size();
+        std::string font_name = text.get_font_name();
         std::cout << "[" << ustr << "] @ ";
         std::cout << "( x=" << bbox.x() << " y=" << bbox.y() << " w=" << bbox.width() << " h=" << bbox.height() << " )";
+        std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )";
         std::cout << std::endl;
 
     }
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index c7c8f852..84af8af2 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -341,6 +341,10 @@ bool TextFontInfo::matches(const TextFontInfo *fontInfo) const {
   return gfxFont == fontInfo->gfxFont;
 }
 
+bool TextFontInfo::matches(const Ref *ref) const { // false when no font is set (gfxFont is null)
+  return gfxFont && gfxFont->getID()->num == ref->num && gfxFont->getID()->gen == ref->gen;
+}
+
 double TextFontInfo::getAscent() const {
   return gfxFont ? gfxFont->getAscent() : 0.95;
 }
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index 0d008b3d..62c95b0f 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -91,6 +91,7 @@ public:
 
   bool matches(const GfxState *state) const;
   bool matches(const TextFontInfo *fontInfo) const;
+  bool matches(const Ref *ref) const;
 
   // Get the font ascent, or a default value if the font is not set
   double getAscent() const;


More information about the poppler mailing list