[poppler] 10 commits - cpp/CMakeLists.txt cpp/poppler-font.cpp cpp/poppler-font.h cpp/poppler-font-private.h cpp/poppler-page.cpp cpp/poppler-page.h cpp/poppler-page-private.h cpp/poppler-private.h cpp/tests poppler/TextOutputDev.cc poppler/TextOutputDev.h
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Tue May 19 21:12:15 UTC 2020
cpp/CMakeLists.txt | 1
cpp/poppler-font-private.h | 82 +++++++++++++++++++++++++++++++
cpp/poppler-font.cpp | 56 ++-------------------
cpp/poppler-font.h | 3 +
cpp/poppler-page-private.h | 5 +
cpp/poppler-page.cpp | 118 ++++++++++++++++++++++++++++++++++++++++++---
cpp/poppler-page.h | 78 +++++++++++++++++++++++++++++
cpp/poppler-private.h | 31 +++++++++++
cpp/tests/poppler-dump.cpp | 20 +++++--
poppler/TextOutputDev.cc | 4 +
poppler/TextOutputDev.h | 1
11 files changed, 338 insertions(+), 61 deletions(-)
New commits:
commit 3189332012ca46998f8ffb872e7ed81c630c4c7a
Author: suzuki toshiya <mpsuzuki at hiroshima-u.ac.jp>
Date: Sat May 16 04:54:55 2020 +0000
[cpp] separate the font info in text_box to another struct.
* add new API, page::text_list(int opt_flag). The old one
taking no argument is kept for ABI compatibility.
The opt_flag is a bitmask, i.e. a bitwise OR of values of the new
enum, page::text_list_option_enum.
* text_box.m_data->text_box_font is a unique pointer to
the storage (if text_list() requests the font info), or
just a null pointer (if text_list() does not request the
font info).
* new option "--show-text-list-with-font" showing font
info, to tests/poppler-dump.cpp. "--show-text-list"
does not load the font info at all.
Co-authored-by: Adam Reichold <adam.reichold at t-online.de>
Co-authored-by: Albert Astals Cid <aacid at kde.org>
diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index f274ca5b..01b0409d 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -299,7 +299,7 @@ static void appendToGooString(void *stream, const char *text, int len) {
ustring page::text(const rectf &r, text_layout_enum layout_mode) const
{
std::unique_ptr<GooString> out(new GooString());
- const bool use_raw_order = (layout_mode == raw_order_layout);
+ const bool use_raw_order = (layout_mode == raw_order_layout);
const bool use_physical_layout = (layout_mode == physical_layout);
TextOutputDev td(&appendToGooString, out.get(), use_physical_layout, 0, use_raw_order, false);
if (r.is_empty()) {
@@ -311,6 +311,11 @@ ustring page::text(const rectf &r, text_layout_enum layout_mode) const
return ustring::from_utf8(out->c_str());
}
+/*
+ * text_box_font_info object for text_box
+ */
+text_box_font_info_data::~text_box_font_info_data() = default;
+
/*
* text_box object for page::text_list()
*/
@@ -352,30 +357,41 @@ bool text_box::has_space_after() const
return m_data->has_space_after;
}
+bool text_box::has_font_info() const
+{
+ return (m_data->text_box_font != nullptr);
+}
+
text_box::writing_mode_enum text_box::get_wmode(int i) const
{
- return m_data->wmodes[i];
+ if (this->has_font_info())
+ return m_data->text_box_font->wmodes[i];
+ else
+ return text_box::invalid_wmode;
}
double text_box::get_font_size() const
{
- return m_data->font_size;
+ if (this->has_font_info())
+ return m_data->text_box_font->font_size;
+ else
+ return -1;
}
std::string text_box::get_font_name(int i) const
{
- int j = m_data->glyph_to_cache_index[i];
+ if (!this->has_font_info())
+ return std::string("*ignored*");
+
+ int j = m_data->text_box_font->glyph_to_cache_index[i];
if (j < 0) {
return std::string("");
}
- return m_data->font_info_cache[j].name();
+ return m_data->text_box_font->font_info_cache[j].name();
}
-
-std::vector<text_box> page::text_list() const
+std::vector<text_box> page::text_list(int opt_flag) const
{
- d->init_font_info_cache();
-
std::vector<text_box> output_list;
/* config values are same with Qt5 Page::TextList() */
@@ -419,41 +435,55 @@ std::vector<text_box> page::text_list() const
word->getRotation(),
{},
word->hasSpaceAfter() == true,
- {},
- word->getFontSize(),
- d->font_info_cache,
- {}
+ nullptr
}};
+ std::unique_ptr<text_box_font_info_data> tb_font_info = nullptr;
+ if (opt_flag & page::text_list_include_font) {
+ d->init_font_info_cache();
+
+ std::unique_ptr<text_box_font_info_data> tb_font{new text_box_font_info_data{
+ word->getFontSize(), // double font_size
+ {}, // std::vector<text_box::writing_mode> wmodes;
+ d->font_info_cache, // std::vector<font_info> font_info_cache;
+ {} // std::vector<int> glyph_to_cache_index;
+ }};
+
+ tb_font_info = std::move(tb_font);
+ };
+
tb.m_data->char_bboxes.reserve(word->getLength());
for (int j = 0; j < word->getLength(); j ++) {
word->getCharBBox(j, &xMin, &yMin, &xMax, &yMax);
tb.m_data->char_bboxes.emplace_back(xMin, yMin, xMax-xMin, yMax-yMin);
}
- tb.m_data->glyph_to_cache_index.reserve(word->getLength());
- for (int j = 0; j < word->getLength(); j++) {
- const TextFontInfo* cur_text_font_info = word->getFontInfo(j);
-
- // filter-out the invalid WMode value here.
- switch (cur_text_font_info->getWMode()) {
- case 0:
- tb.m_data->wmodes.push_back(text_box::horizontal_wmode);
- break;
- case 1:
- tb.m_data->wmodes.push_back(text_box::vertical_wmode);
- break;
- default:
- tb.m_data->wmodes.push_back(text_box::invalid_wmode);
- };
-
- tb.m_data->glyph_to_cache_index[j] = -1;
- for (size_t k = 0; k < d->font_info_cache.size(); k++) {
- if (cur_text_font_info->matches(&(d->font_info_cache[k].d->ref))) {
- tb.m_data->glyph_to_cache_index[j] = k;
+ if (tb_font_info && d->font_info_cache_initialized) {
+ tb_font_info->glyph_to_cache_index.reserve(word->getLength());
+ for (int j = 0; j < word->getLength(); j++) {
+ const TextFontInfo* cur_text_font_info = word->getFontInfo(j);
+
+ // filter-out the invalid WMode value here.
+ switch (cur_text_font_info->getWMode()) {
+ case 0:
+ tb_font_info->wmodes.push_back(text_box::horizontal_wmode);
break;
+ case 1:
+ tb_font_info->wmodes.push_back(text_box::vertical_wmode);
+ break;
+ default:
+ tb_font_info->wmodes.push_back(text_box::invalid_wmode);
+ };
+
+ tb_font_info->glyph_to_cache_index[j] = -1;
+ for (size_t k = 0; k < tb_font_info->font_info_cache.size(); k++) {
+ if (cur_text_font_info->matches(&(tb_font_info->font_info_cache[k].d->ref))) {
+ tb_font_info->glyph_to_cache_index[j] = k;
+ break;
+ }
}
}
+ tb.m_data->text_box_font = std::move(tb_font_info);
}
output_list.push_back(std::move(tb));
@@ -462,3 +492,8 @@ std::vector<text_box> page::text_list() const
return output_list;
}
+
+std::vector<text_box> page::text_list() const
+{
+ return text_list(0);
+}
diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index ca5be2fd..dd6ebf2c 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -66,6 +66,12 @@ public:
rectf char_bbox(size_t i) const;
bool has_space_after() const;
+
+ /**
+ \since 0.89
+ */
+ bool has_font_info() const;
+
/**
Get a writing mode for the i-th glyph
@@ -186,6 +192,22 @@ public:
*/
std::vector<text_box> text_list() const;
+ /*
+ * text_list_option_enum is a bitmask-style flags for text_list(),
+ * 0 means the default & simplest behaviour.
+ */
+ enum text_list_option_enum {
+ text_list_include_font = 1 // \since 0.89
+ };
+
+ /**
+ Extended version of text_list() taking an option flag.
+ The option flag should be the multiple of text_list_option_enum.
+
+ \since 0.89
+ */
+ std::vector<text_box> text_list(int opt_flag) const;
+
private:
page(document_private *doc, int index);
diff --git a/cpp/poppler-private.h b/cpp/poppler-private.h
index 83e46319..4ec159a8 100644
--- a/cpp/poppler-private.h
+++ b/cpp/poppler-private.h
@@ -73,23 +73,17 @@ void delete_all(const Collection &c)
}
class font_info;
-struct text_box_data
+struct text_box_font_info_data
{
- ~text_box_data();
-
- ustring text;
- rectf bbox;
- int rotation;
- std::vector<rectf> char_bboxes;
- bool has_space_after;
+ ~text_box_font_info_data();
- std::vector<text_box::writing_mode_enum> wmodes;
double font_size;
+ std::vector<text_box::writing_mode_enum> wmodes;
/*
* a duplication of the font_info_cache created by the
* poppler::font_iterator and owned by the poppler::page
- * object. Its lifetime might differ from that of text_box
+ * object. Its lifetime might differ from that of text_box
* object (think about collecting all text_box objects
* from all pages), so we have to duplicate it into all
* text_box instances.
@@ -97,7 +91,7 @@ struct text_box_data
std::vector<font_info> font_info_cache;
/*
- * a std::vector from the glyph index in the current
+ * a std::vector from the glyph index in the owner
* text_box to the font_info index in font_info_cache.
* The "-1" means no corresponding fonts found in the
* cache.
@@ -105,6 +99,20 @@ struct text_box_data
std::vector<int> glyph_to_cache_index;
};
+class font_info;
+struct text_box_data
+{
+ ~text_box_data();
+
+ ustring text;
+ rectf bbox;
+ int rotation;
+ std::vector<rectf> char_bboxes;
+ bool has_space_after;
+
+ std::unique_ptr<text_box_font_info_data> text_box_font;
+};
+
}
#endif
diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp
index 7864979e..ef391d78 100644
--- a/cpp/tests/poppler-dump.cpp
+++ b/cpp/tests/poppler-dump.cpp
@@ -60,6 +60,7 @@ bool show_help = false;
bool show_version = false;
char show_text[32];
bool show_text_list = false;
+bool show_text_list_with_font = false;
poppler::page::text_layout_enum show_text_layout = poppler::page::physical_layout;
static const ArgDesc the_args[] = {
@@ -85,6 +86,8 @@ static const ArgDesc the_args[] = {
"show text (physical|raw|none) extracted from all pages" },
{ "--show-text-list", argFlag, &show_text_list, 0,
"show text list (experimental)" },
+ { "--show-text-list-with-font", argFlag, &show_text_list_with_font, 0,
+ "show text list with font info (experimental)" },
{ "-h", argFlag, &show_help, 0,
"print usage information" },
{ "--help", argFlag, &show_help, 0,
@@ -417,14 +420,14 @@ static void print_page_text(poppler::page *p)
std::cout << std::endl;
}
-static void print_page_text_list(poppler::page *p)
+static void print_page_text_list(poppler::page *p, int opt_flag = 0)
{
if (!p) {
std::cout << std::setw(out_width) << "Broken Page. Could not be parsed" << std::endl;
std::cout << std::endl;
return;
}
- auto text_list = p->text_list();
+ auto text_list = p->text_list(opt_flag);
std::cout << "---" << std::endl;
for (const poppler::text_box &text : text_list) {
@@ -435,9 +438,9 @@ static void print_page_text_list(poppler::page *p)
std::string font_name = text.get_font_name();
std::cout << "[" << ustr << "] @ ";
std::cout << "( x=" << bbox.x() << " y=" << bbox.y() << " w=" << bbox.width() << " h=" << bbox.height() << " )";
- std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )";
+ if (text.has_font_info())
+ std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )";
std::cout << std::endl;
-
}
std::cout << "---" << std::endl;
}
@@ -538,12 +541,15 @@ int main(int argc, char *argv[])
print_page_text(p.get());
}
}
- if (show_text_list) {
+ if (show_text_list || show_text_list_with_font) {
const int pages = doc->pages();
for (int i = 0; i < pages; ++i) {
std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl;
std::unique_ptr<poppler::page> p(doc->create_page(i));
- print_page_text_list(p.get());
+ if (show_text_list_with_font)
+ print_page_text_list(p.get(), poppler::page::text_list_include_font);
+ else
+ print_page_text_list(p.get(), 0);
}
}
commit 437553ecb26948f77c3dbf7ad29bca86ffff7f6e
Author: Albert Astals Cid <aacid at kde.org>
Date: Fri May 15 12:57:32 2020 +0000
[cpp] change page_private::init_font_info_cache() to a void method.
We already have a boolean font_info_cache_initialized, no need to
guess the initialization result by the size of initialized cache.
diff --git a/cpp/poppler-page-private.h b/cpp/poppler-page-private.h
index d4954e9d..442f8bb1 100644
--- a/cpp/poppler-page-private.h
+++ b/cpp/poppler-page-private.h
@@ -50,7 +50,7 @@ public:
std::vector<font_info> font_info_cache;
bool font_info_cache_initialized;
- size_t init_font_info_cache();
+ void init_font_info_cache();
};
}
diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index e44ef26e..f274ca5b 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -57,10 +57,10 @@ page_private::~page_private()
delete transition;
}
-size_t page_private::init_font_info_cache()
+void page_private::init_font_info_cache()
{
if (font_info_cache_initialized)
- return font_info_cache.size();
+ return;
poppler::font_iterator it(index, doc);
@@ -69,7 +69,7 @@ size_t page_private::init_font_info_cache()
}
font_info_cache_initialized = true;
- return font_info_cache.size();
+ return;
}
/**
commit 57de32198a4406eae18b80eed42e6050e2b48cca
Author: Albert Astals Cid <aacid at kde.org>
Date: Fri May 15 12:23:50 2020 +0000
[cpp] in poppler-page.h, add "since 0.89" comment to 3 new methods.
diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index 50ccdb06..ca5be2fd 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -66,10 +66,6 @@ public:
rectf char_bbox(size_t i) const;
bool has_space_after() const;
- /**
- \since 0.8x
- */
-
/**
Get a writing mode for the i-th glyph
@@ -85,6 +81,10 @@ public:
horizontal_wmode = 0,
vertical_wmode = 1
};
+
+ /**
+ \since 0.89
+ */
writing_mode_enum get_wmode(int i = 0) const;
/**
@@ -93,6 +93,10 @@ public:
This method return a double floating value of the
font size from the text_box instance.
*/
+
+ /**
+ \since 0.89
+ */
double get_font_size() const;
/**
@@ -111,6 +115,10 @@ public:
Latin1 or UTF-8. Some legacy PDF producers used
in CJK market use GBK, Big5, Wansung or Shift-JIS.
*/
+
+ /**
+ \since 0.89
+ */
std::string get_font_name(int i = 0) const;
private:
commit 507027de297f43146f5bbebe8d098dededffc577
Author: suzuki toshiya <mpsuzuki at hiroshima-u.ac.jp>
Date: Tue May 5 10:11:49 2020 +0000
[cpp] introduce a boolean font_info_cache_initialized, to distinguish an initialized-but-empty cache from the uninitialized cache
Co-authored-by: Adam Reichold <adam.reichold at t-online.de>
diff --git a/cpp/poppler-page-private.h b/cpp/poppler-page-private.h
index 3e2ee914..d4954e9d 100644
--- a/cpp/poppler-page-private.h
+++ b/cpp/poppler-page-private.h
@@ -49,6 +49,7 @@ public:
{ return const_cast<poppler::page *>(p)->d; }
std::vector<font_info> font_info_cache;
+ bool font_info_cache_initialized;
size_t init_font_info_cache();
};
diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index b0bf847e..e44ef26e 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -48,6 +48,7 @@ page_private::page_private(document_private *_doc, int _index)
, page(doc->doc->getCatalog()->getPage(_index + 1))
, index(_index)
, transition(nullptr)
+ , font_info_cache_initialized(false)
{
}
@@ -58,7 +59,7 @@ page_private::~page_private()
size_t page_private::init_font_info_cache()
{
- if (font_info_cache.size() > 0)
+ if (font_info_cache_initialized)
return font_info_cache.size();
poppler::font_iterator it(index, doc);
@@ -67,6 +68,7 @@ size_t page_private::init_font_info_cache()
font_info_cache = it.next();
}
+ font_info_cache_initialized = true;
return font_info_cache.size();
}
commit 2cd79c7382888559d5d8dcc56a84572ac8a77086
Author: Adam Reichold <adam.reichold at t-online.de>
Date: Tue May 5 01:22:29 2020 +0000
[cpp] construct a font_iterator instance in the local storage of page_private::init_font_info_cache() method, instead of the heap
diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index 3aa6222b..b0bf847e 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -61,14 +61,12 @@ size_t page_private::init_font_info_cache()
if (font_info_cache.size() > 0)
return font_info_cache.size();
- poppler::font_iterator* font_iterator = new poppler::font_iterator(index, doc);
+ poppler::font_iterator it(index, doc);
- if (font_iterator->has_next()) {
- font_info_cache = font_iterator->next();
+ if (it.has_next()) {
+ font_info_cache = it.next();
}
- delete font_iterator;
-
return font_info_cache.size();
}
commit 7279b4eb397667cd4553f5852286b3f3d73a1a83
Author: Adam Reichold <adam.reichold at t-online.de>
Date: Mon May 4 11:51:55 2020 +0000
[cpp] remove the wrong warning note about the std::string object returned by text_box::get_font_name()
diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index 9db6f87b..50ccdb06 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -110,10 +110,6 @@ public:
encoding of the font name is one of the ASCII,
Latin1 or UTF-8. Some legacy PDF producers used
in CJK market use GBK, Big5, Wansung or Shift-JIS.
-
- \warning The returned std::string is owned by the
- text_box instance, it should not be used in the
- other objects or should not be destroyed directly.
*/
std::string get_font_name(int i = 0) const;
commit af3805f0b60289c7f522da29f9375119a1cd778a
Author: Albert Astals Cid <aacid at kde.org>
Date: Mon May 4 04:32:27 2020 +0000
[cpp] new enum poppler::text_box::writing_mode_enum
diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index 715c5ec4..3aa6222b 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -352,7 +352,7 @@ bool text_box::has_space_after() const
return m_data->has_space_after;
}
-int text_box::get_wmode(int i) const
+text_box::writing_mode_enum text_box::get_wmode(int i) const
{
return m_data->wmodes[i];
}
@@ -434,7 +434,18 @@ std::vector<text_box> page::text_list() const
tb.m_data->glyph_to_cache_index.reserve(word->getLength());
for (int j = 0; j < word->getLength(); j++) {
const TextFontInfo* cur_text_font_info = word->getFontInfo(j);
- tb.m_data->wmodes.push_back(cur_text_font_info->getWMode());
+
+ // filter-out the invalid WMode value here.
+ switch (cur_text_font_info->getWMode()) {
+ case 0:
+ tb.m_data->wmodes.push_back(text_box::horizontal_wmode);
+ break;
+ case 1:
+ tb.m_data->wmodes.push_back(text_box::vertical_wmode);
+ break;
+ default:
+ tb.m_data->wmodes.push_back(text_box::invalid_wmode);
+ };
tb.m_data->glyph_to_cache_index[j] = -1;
for (size_t k = 0; k < d->font_info_cache.size(); k++) {
diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index 6f9e755d..9db6f87b 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -73,17 +73,19 @@ public:
/**
Get a writing mode for the i-th glyph
- This method returns an integer of the writing mode
+ This method returns an enum of the writing mode
for the i-th glyph in the text_box.
- 0 means the horizontal writing mode.
- 1 means the vertical writing mode.
-
\note Usually all glyphs in one text_box have the
same writing mode. Thus the default value of the
glyph index is 0.
*/
- int get_wmode(int i = 0) const;
+ enum writing_mode_enum {
+ invalid_wmode = -1,
+ horizontal_wmode = 0,
+ vertical_wmode = 1
+ };
+ writing_mode_enum get_wmode(int i = 0) const;
/**
Get a font size of this text_box instance.
diff --git a/cpp/poppler-private.h b/cpp/poppler-private.h
index 0fe33d3f..83e46319 100644
--- a/cpp/poppler-private.h
+++ b/cpp/poppler-private.h
@@ -28,6 +28,7 @@
#include "poppler-global.h"
#include "poppler-rectangle.h"
+#include "poppler-page.h" // to use text_box::writing_mode_enum
#include "Error.h"
#include "CharTypes.h"
@@ -82,7 +83,7 @@ struct text_box_data
std::vector<rectf> char_bboxes;
bool has_space_after;
- std::vector<int> wmodes;
+ std::vector<text_box::writing_mode_enum> wmodes;
double font_size;
/*
commit 65053f43dbb83b66302bddda27732168fc74cca1
Author: Albert Astals Cid <aacid at kde.org>
Date: Sun May 3 16:21:38 2020 +0000
[TextOutputDev] simplify TextFontInfo::matches(const Ref *ref)
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 84af8af2..993a37da 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -342,7 +342,7 @@ bool TextFontInfo::matches(const TextFontInfo *fontInfo) const {
}
bool TextFontInfo::matches(const Ref *ref) const {
- return (gfxFont->getID()->num == ref->num && gfxFont->getID()->gen == ref->gen);
+ return (*(gfxFont->getID()) == *ref);
}
double TextFontInfo::getAscent() const {
commit 4ea2e879d4e0e9a5d899adb82bbdaab9e505532c
Author: Albert Astals Cid <aacid at kde.org>
Date: Sun May 3 16:17:11 2020 +0000
[cpp] simplify the initialization of poppler::font_info_private.ref and .emb_ref
diff --git a/cpp/poppler-font-private.h b/cpp/poppler-font-private.h
index b24cbaf0..aa26e2f9 100644
--- a/cpp/poppler-font-private.h
+++ b/cpp/poppler-font-private.h
@@ -48,10 +48,8 @@ public:
font_file = fi->getFile()->c_str();
}
- ref.num = fi->getRef().num;
- ref.gen = fi->getRef().gen;
- emb_ref.num = fi->getEmbRef().num;
- emb_ref.gen = fi->getEmbRef().gen;
+ ref = fi->getRef();
+ emb_ref = fi->getEmbRef();
}
std::string font_name;
commit 60400514324d6e5d0a1c50ce4af84320d350e967
Author: suzuki toshiya <mpsuzuki at hiroshima-u.ac.jp>
Date: Fri May 1 08:04:14 2020 +0000
[cpp] Add the font infos to the text_box object.
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 627920ff..32b3ef88 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -38,6 +38,7 @@ install(FILES
poppler-document.h
poppler-embedded-file.h
poppler-font.h
+ poppler-font-private.h
poppler-global.h
poppler-image.h
poppler-page.h
diff --git a/cpp/poppler-font-private.h b/cpp/poppler-font-private.h
new file mode 100644
index 00000000..b24cbaf0
--- /dev/null
+++ b/cpp/poppler-font-private.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2009, Pino Toscano <pino at kde.org>
+ * Copyright (C) 2015, Tamas Szekeres <szekerest at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "poppler-font.h"
+
+#include "poppler-document-private.h"
+
+#include "FontInfo.h"
+
+#include <algorithm>
+
+using namespace poppler;
+
+class poppler::font_info_private
+{
+public:
+ font_info_private()
+ : type(font_info::unknown)
+ , is_embedded(false)
+ , is_subset(false)
+ {
+ }
+ font_info_private(FontInfo *fi)
+ : type((font_info::type_enum)fi->getType())
+ , is_embedded(fi->getEmbedded())
+ , is_subset(fi->getSubset())
+ {
+ if (fi->getName()) {
+ font_name = fi->getName()->c_str();
+ }
+ if (fi->getFile()) {
+ font_file = fi->getFile()->c_str();
+ }
+
+ ref.num = fi->getRef().num;
+ ref.gen = fi->getRef().gen;
+ emb_ref.num = fi->getEmbRef().num;
+ emb_ref.gen = fi->getEmbRef().gen;
+ }
+
+ std::string font_name;
+ std::string font_file;
+ font_info::type_enum type : 5;
+ bool is_embedded : 1;
+ bool is_subset : 1;
+
+ Ref ref;
+ Ref emb_ref;
+};
+
+
+class poppler::font_iterator_private
+{
+public:
+ font_iterator_private(int start_page, document_private *dd)
+ : font_info_scanner(dd->doc, start_page)
+ , total_pages(dd->doc->getNumPages())
+ , current_page((std::max)(start_page, 0))
+ {
+ }
+ ~font_iterator_private()
+ {
+ }
+
+ FontInfoScanner font_info_scanner;
+ int total_pages;
+ int current_page;
+};
diff --git a/cpp/poppler-font.cpp b/cpp/poppler-font.cpp
index 6d833c19..e8a4076f 100644
--- a/cpp/poppler-font.cpp
+++ b/cpp/poppler-font.cpp
@@ -24,6 +24,8 @@
*/
#include "poppler-font.h"
+#include "poppler-font-private.h"
+
#include "poppler-document-private.h"
#include "FontInfo.h"
@@ -32,54 +34,6 @@
using namespace poppler;
-class poppler::font_info_private
-{
-public:
- font_info_private()
- : type(font_info::unknown)
- , is_embedded(false)
- , is_subset(false)
- {
- }
- font_info_private(FontInfo *fi)
- : type((font_info::type_enum)fi->getType())
- , is_embedded(fi->getEmbedded())
- , is_subset(fi->getSubset())
- {
- if (fi->getName()) {
- font_name = fi->getName()->c_str();
- }
- if (fi->getFile()) {
- font_file = fi->getFile()->c_str();
- }
- }
-
- std::string font_name;
- std::string font_file;
- font_info::type_enum type : 5;
- bool is_embedded : 1;
- bool is_subset : 1;
-};
-
-
-class poppler::font_iterator_private
-{
-public:
- font_iterator_private(int start_page, document_private *dd)
- : font_info_scanner(dd->doc, start_page)
- , total_pages(dd->doc->getNumPages())
- , current_page((std::max)(start_page, 0))
- {
- }
- ~font_iterator_private()
- {
- }
-
- FontInfoScanner font_info_scanner;
- int total_pages;
- int current_page;
-};
-
/**
\class poppler::font_info poppler-font.h "poppler/cpp/poppler-font.h"
@@ -208,7 +162,7 @@ font_iterator::~font_iterator()
}
/**
- Returns the fonts of the current page and advances to the next one.
+ \returns the fonts of the current page and advances to the next one.
*/
std::vector<font_info> font_iterator::next()
{
@@ -218,6 +172,10 @@ std::vector<font_info> font_iterator::next()
++d->current_page;
+ /* FontInfoScanner::scan() receives a number how many pages to
+ * be scanned from the *current page*, not from the beginning.
+ * We restrict the font scanning to the current page only.
+ */
const std::vector<FontInfo*> items = d->font_info_scanner.scan(1);
std::vector<font_info> fonts;
fonts.reserve(items.size());
diff --git a/cpp/poppler-font.h b/cpp/poppler-font.h
index 854b7a40..27667e78 100644
--- a/cpp/poppler-font.h
+++ b/cpp/poppler-font.h
@@ -67,6 +67,7 @@ private:
font_info_private *d;
friend class font_iterator;
+ friend class page;
};
@@ -84,6 +85,8 @@ private:
font_iterator_private *d;
friend class document;
+ friend class page;
+ friend class page_private;
};
}
diff --git a/cpp/poppler-page-private.h b/cpp/poppler-page-private.h
index e0c3446d..3e2ee914 100644
--- a/cpp/poppler-page-private.h
+++ b/cpp/poppler-page-private.h
@@ -29,6 +29,7 @@ namespace poppler
class document_private;
class page_transition;
+class font_info;
class page_private
{
@@ -46,6 +47,9 @@ public:
static inline page_private* get(const poppler::page *p)
{ return const_cast<poppler::page *>(p)->d; }
+
+ std::vector<font_info> font_info_cache;
+ size_t init_font_info_cache();
};
}
diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index 7150cd78..715c5ec4 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -32,6 +32,8 @@
#include "poppler-document-private.h"
#include "poppler-page-private.h"
#include "poppler-private.h"
+#include "poppler-font-private.h"
+#include "poppler-font.h"
#include "TextOutputDev.h"
@@ -54,6 +56,22 @@ page_private::~page_private()
delete transition;
}
+size_t page_private::init_font_info_cache()
+{
+ if (font_info_cache.size() > 0)
+ return font_info_cache.size();
+
+ poppler::font_iterator* font_iterator = new poppler::font_iterator(index, doc);
+
+ if (font_iterator->has_next()) {
+ font_info_cache = font_iterator->next();
+ }
+
+ delete font_iterator;
+
+ return font_info_cache.size();
+}
+
/**
\class poppler::page poppler-page.h "poppler/cpp/poppler-page.h"
@@ -334,17 +352,39 @@ bool text_box::has_space_after() const
return m_data->has_space_after;
}
+int text_box::get_wmode(int i) const
+{
+ return m_data->wmodes[i];
+}
+
+double text_box::get_font_size() const
+{
+ return m_data->font_size;
+}
+
+std::string text_box::get_font_name(int i) const
+{
+ int j = m_data->glyph_to_cache_index[i];
+ if (j < 0) {
+ return std::string("");
+ }
+ return m_data->font_info_cache[j].name();
+}
+
+
std::vector<text_box> page::text_list() const
{
+ d->init_font_info_cache();
+
std::vector<text_box> output_list;
/* config values are same with Qt5 Page::TextList() */
auto output_dev = std::make_unique<TextOutputDev>(
- nullptr, /* char* fileName */
- false, /* bool physLayoutA */
+ nullptr, /* char* fileName */
+ false, /* bool physLayoutA */
0, /* double fixedPitchA */
- false, /* bool rawOrderA */
- false /* bool append */
+ false, /* bool rawOrderA */
+ false /* bool append */
);
/*
@@ -378,7 +418,11 @@ std::vector<text_box> page::text_list() const
{xMin, yMin, xMax-xMin, yMax-yMin},
word->getRotation(),
{},
- word->hasSpaceAfter() == true
+ word->hasSpaceAfter() == true,
+ {},
+ word->getFontSize(),
+ d->font_info_cache,
+ {}
}};
tb.m_data->char_bboxes.reserve(word->getLength());
@@ -387,6 +431,20 @@ std::vector<text_box> page::text_list() const
tb.m_data->char_bboxes.emplace_back(xMin, yMin, xMax-xMin, yMax-yMin);
}
+ tb.m_data->glyph_to_cache_index.reserve(word->getLength());
+ for (int j = 0; j < word->getLength(); j++) {
+ const TextFontInfo* cur_text_font_info = word->getFontInfo(j);
+ tb.m_data->wmodes.push_back(cur_text_font_info->getWMode());
+
+ tb.m_data->glyph_to_cache_index[j] = -1;
+ for (size_t k = 0; k < d->font_info_cache.size(); k++) {
+ if (cur_text_font_info->matches(&(d->font_info_cache[k].d->ref))) {
+ tb.m_data->glyph_to_cache_index[j] = k;
+ break;
+ }
+ }
+ }
+
output_list.push_back(std::move(tb));
}
}
diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index 30ede302..6f9e755d 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -65,6 +65,56 @@ public:
*/
rectf char_bbox(size_t i) const;
bool has_space_after() const;
+
+ /**
+ \since 0.8x
+ */
+
+ /**
+ Get a writing mode for the i-th glyph
+
+ This method returns an integer of the writing mode
+ for the i-th glyph in the text_box.
+
+ 0 means the horizontal writing mode.
+ 1 means the vertical writing mode.
+
+ \note Usually all glyphs in one text_box have the
+ same writing mode. Thus the default value of the
+ glyph index is 0.
+ */
+ int get_wmode(int i = 0) const;
+
+ /**
+ Get a font size of this text_box instance.
+
+ This method return a double floating value of the
+ font size from the text_box instance.
+ */
+ double get_font_size() const;
+
+ /**
+ Get a font name for the i-th glyph
+
+ This method returns a std::string object holding
+ the font name for the i-th glyph.
+
+ \note The randomization prefix of the embedded fonts
+ are not removed. The font names including these
+ prefixes are insuffucient to determine whether the
+ two fonts are same or different.
+
+ \note The clients should not assume that the
+ encoding of the font name is one of the ASCII,
+ Latin1 or UTF-8. Some legacy PDF producers used
+ in CJK market use GBK, Big5, Wansung or Shift-JIS.
+
+ \warning The returned std::string is owned by the
+ text_box instance, it should not be used in the
+ other objects or should not be destroyed directly.
+ */
+ std::string get_font_name(int i = 0) const;
+
private:
text_box(text_box_data *data);
diff --git a/cpp/poppler-private.h b/cpp/poppler-private.h
index b9bc9b52..0fe33d3f 100644
--- a/cpp/poppler-private.h
+++ b/cpp/poppler-private.h
@@ -71,6 +71,7 @@ void delete_all(const Collection &c)
delete_all(c.begin(), c.end());
}
+class font_info;
struct text_box_data
{
~text_box_data();
@@ -80,6 +81,27 @@ struct text_box_data
int rotation;
std::vector<rectf> char_bboxes;
bool has_space_after;
+
+ std::vector<int> wmodes;
+ double font_size;
+
+ /*
+ * a duplication of the font_info_cache created by the
+ * poppler::font_iterator and owned by the poppler::page
+ * object. Its lifetime might differ from that of text_box
+ * object (think about collecting all text_box objects
+ * from all pages), so we have to duplicate it into all
+ * text_box instances.
+ */
+ std::vector<font_info> font_info_cache;
+
+ /*
+ * a std::vector from the glyph index in the current
+ * text_box to the font_info index in font_info_cache.
+ * The "-1" means no corresponding fonts found in the
+ * cache.
+ */
+ std::vector<int> glyph_to_cache_index;
};
}
diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp
index 6196b675..7864979e 100644
--- a/cpp/tests/poppler-dump.cpp
+++ b/cpp/tests/poppler-dump.cpp
@@ -430,8 +430,12 @@ static void print_page_text_list(poppler::page *p)
for (const poppler::text_box &text : text_list) {
poppler::rectf bbox = text.bbox();
poppler::ustring ustr = text.text();
+ int wmode = text.get_wmode();
+ double font_size = text.get_font_size();
+ std::string font_name = text.get_font_name();
std::cout << "[" << ustr << "] @ ";
std::cout << "( x=" << bbox.x() << " y=" << bbox.y() << " w=" << bbox.width() << " h=" << bbox.height() << " )";
+ std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )";
std::cout << std::endl;
}
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index c7c8f852..84af8af2 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -341,6 +341,10 @@ bool TextFontInfo::matches(const TextFontInfo *fontInfo) const {
return gfxFont == fontInfo->gfxFont;
}
+bool TextFontInfo::matches(const Ref *ref) const {
+ return (gfxFont->getID()->num == ref->num && gfxFont->getID()->gen == ref->gen);
+}
+
double TextFontInfo::getAscent() const {
return gfxFont ? gfxFont->getAscent() : 0.95;
}
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index 0d008b3d..62c95b0f 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -91,6 +91,7 @@ public:
bool matches(const GfxState *state) const;
bool matches(const TextFontInfo *fontInfo) const;
+ bool matches(const Ref *ref) const;
// Get the font ascent, or a default value if the font is not set
double getAscent() const;
More information about the poppler
mailing list