[poppler] 2 commits - cpp/poppler-page.cpp cpp/poppler-page.h cpp/poppler-private.h cpp/tests
Albert Astals Cid
aacid at kemper.freedesktop.org
Mon Feb 26 23:45:18 UTC 2018
cpp/poppler-page.cpp | 91 +++++++++++++++++++++++++++++++++++++++++++++
cpp/poppler-page.h | 58 ++++++++++++++++++++++++++++
cpp/poppler-private.h | 9 ++++
cpp/tests/poppler-dump.cpp | 34 ++++++++++++++++
4 files changed, 192 insertions(+)
New commits:
commit 2740b3aca81a6a8c690540fc141e5923a1fff460
Author: Albert Astals Cid <aacid at kde.org>
Date: Tue Feb 27 00:47:04 2018 +0100
cpp: Add since
diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index df5cb36a..93a13d18 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -1,6 +1,7 @@
/*
* Copyright (C) 2009-2010, Pino Toscano <pino at kde.org>
* Copyright (C) 2018, Suzuki Toshiya <mpsuzuki at hiroshima-u.ac.jp>
+ * Copyright (C) 2018, Albert Astals Cid <aacid at kde.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -111,6 +112,8 @@ public:
up-to-down), the std::vector contains the text in the proper
order.
+ \since 0.63
+
\note The page object owns the text_box objects as unique_ptr,
the caller is not needed to free them.
commit 42a6b8651f040f0960802e705b1aea82a956a63b
Author: suzuki toshiya <mpsuzuki at hiroshima-u.ac.jp>
Date: Tue Feb 27 00:46:18 2018 +0100
cpp: Add page::text_list
diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index 8913c8eb..83d48f07 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -2,6 +2,7 @@
* Copyright (C) 2009-2010, Pino Toscano <pino at kde.org>
* Copyright (C) 2017, Albert Astals Cid <aacid at kde.org>
* Copyright (C) 2017, Jason Alan Palmer <jalanpalmer at gmail.com>
+ * Copyright (C) 2018, Suzuki Toshiya <mpsuzuki at hiroshima-u.ac.jp>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -285,3 +286,93 @@ ustring page::text(const rectf &r, text_layout_enum layout_mode) const
}
return ustring::from_utf8(s->getCString());
}
+
+/*
+ * Implementation of text_box, the element type returned by page::text_list()
+ */
+text_box::~text_box() = default; // defined out of line: poppler-page.h only forward-declares text_box_data
+
+text_box::text_box(text_box_data *data) : m_data{data} // data is stored in the m_data unique_ptr, which owns it from here on
+{
+}
+
+ustring text_box::text() const // text stored for this box
+{
+ return m_data->text; // returned by value, i.e. a copy of the stored ustring
+}
+
+rectf text_box::bbox() const // bounding box stored for the whole box
+{
+ return m_data->bbox; // page::text_list() builds it from (xMin, yMin, xMax-xMin, yMax-yMin)
+}
+
+rectf text_box::char_bbox(size_t i) const // box stored at index i of char_bboxes
+{
+ if (i < m_data->char_bboxes.size()) // bounds check before indexing
+ return m_data->char_bboxes[i];
+ return rectf(0, 0, 0, 0); // out-of-range index: an all-zero rectangle is returned instead of an error
+}
+
+bool text_box::has_space_after() const // flag stored for this box
+{
+ return m_data->has_space_after; // page::text_list() sets it from TextWord::hasSpaceAfter()
+}
+
+std::vector<text_box> page::text_list() const // builds one text_box per entry of the page's TextWordList
+{
+ std::vector<text_box> output_list;
+
+ /* The configuration values are the same as in Qt5 Page::TextList() */
+ std::unique_ptr<TextOutputDev> output_dev{
+ new TextOutputDev(nullptr, /* char* fileName */
+ gFalse, /* GBool physLayoutA */
+ 0, /* double fixedPitchA */
+ gFalse, /* GBool rawOrderA */
+ gFalse) /* GBool append */
+ };
+
+ /*
+ * The configuration values are the same as in Qt5 Page::TextList(),
+ * except that the rotation is fixed to zero,
+ * because few people use non-zero values.
+ */
+ d->doc->doc->displayPageSlice(output_dev.get(),
+ d->index + 1, /* page */
+ 72, 72, 0, /* hDPI, vDPI, rot */
+ gFalse, gFalse, gFalse, /* useMediaBox, crop, printing */
+ -1, -1, -1, -1, /* sliceX, sliceY, sliceW, sliceH */
+ nullptr, nullptr, /* abortCheckCbk(), abortCheckCbkData */
+ nullptr, nullptr, /* annotDisplayDecideCbk(), annotDisplayDecideCbkData */
+ gTrue); /* copyXRef */
+
+ if (std::unique_ptr< TextWordList > word_list{output_dev->makeWordList()}) { // word_list owns the result; a null result leaves output_list empty
+
+ output_list.reserve(word_list->getLength()); // one text_box per word
+ for (int i = 0; i < word_list->getLength(); i ++) {
+ TextWord *word = word_list->get(i); // not deleted here
+
+ std::unique_ptr<GooString> gooWord{word->getText()}; // gooWord owns the GooString and frees it at the end of the iteration
+ ustring ustr = detail::unicode_GooString_to_ustring(gooWord.get());
+
+ double xMin, yMin, xMax, yMax;
+ word->getBBox(&xMin, &yMin, &xMax, &yMax);
+
+ text_box tb{new text_box_data{ // fields in declaration order: text, bbox, char_bboxes, has_space_after
+ ustr,
+ {xMin, yMin, xMax-xMin, yMax-yMin}, // min corner plus extents computed from the max corner
+ {}, // char_bboxes starts empty and is filled below
+ word->hasSpaceAfter() == gTrue
+ }};
+
+ tb.m_data->char_bboxes.reserve(word->getLength());
+ for (int j = 0; j < word->getLength(); j ++) { // one box per index in [0, word->getLength())
+ word->getCharBBox(j, &xMin, &yMin, &xMax, &yMax); // reuses the xMin..yMax locals
+ tb.m_data->char_bboxes.push_back({xMin, yMin, xMax-xMin, yMax-yMin});
+ }
+
+ output_list.push_back(std::move(tb)); // text_box declares only move operations, so it is moved in
+ }
+ }
+
+ return output_list;
+}
diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index 7b4298a1..df5cb36a 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2009-2010, Pino Toscano <pino at kde.org>
+ * Copyright (C) 2018, Suzuki Toshiya <mpsuzuki at hiroshima-u.ac.jp>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -22,9 +23,45 @@
#include "poppler-global.h"
#include "poppler-rectangle.h"
+#include <memory>
+
namespace poppler
{
+struct text_box_data;
+class POPPLER_CPP_EXPORT text_box // one item of the vector returned by page::text_list()
+{
+ friend class page; // page uses the private constructor and m_data
+public:
+ text_box(text_box&&) = default; // movable; no copy operations are declared
+ text_box& operator=(text_box&&) = default;
+
+ ~text_box(); // declared here, defined in the .cpp file
+
+ ustring text() const; // text of this box
+ rectf bbox() const; // bounding box of this box
+
+ /**
+ Get the bounding box of the i-th glyph
+
+ This method returns a rectf holding the bounding box of
+ the i-th glyph in the text_box.
+
+ \note The rectf is returned by value,
+ so the caller has nothing to free.
+
+ \warning If the glyph index is out of range, rectf(0,0,0,0)
+ is returned. The number of glyphs and the number of ustring
+ code points may differ in some complex scripts.
+ */
+ rectf char_bbox(size_t i) const;
+ bool has_space_after() const; // stored has_space_after flag of this box
+private:
+ text_box(text_box_data *data); // private: only the class itself and its friend page can construct a text_box
+
+ std::unique_ptr<text_box_data> m_data; // text_box_data is only forward-declared in this header
+};
+
class document;
class document_private;
class page_private;
@@ -63,6 +100,24 @@ public:
ustring text(const rectf &rect = rectf()) const;
ustring text(const rectf &rect, text_layout_enum layout_mode) const;
+ /**
+ Returns the text of the page as a list of text boxes
+
+ This method returns a std::vector of text_box items that contain
+ all the text of the page, with roughly one word of text
+ per text_box item.
+
+ For text written in western languages (left-to-right and
+ up-to-down), the std::vector contains the text in the proper
+ order.
+
+ \note The page object owns the text_box objects as unique_ptr,
+ the caller is not needed to free them.
+
+ \warning This method has not been tested with Asian scripts
+ */
+ std::vector<text_box> text_list() const;
+
private:
page(document_private *doc, int index);
diff --git a/cpp/poppler-private.h b/cpp/poppler-private.h
index 147073d9..3753567f 100644
--- a/cpp/poppler-private.h
+++ b/cpp/poppler-private.h
@@ -3,6 +3,7 @@
* Copyright (C) 2013 Adrian Johnson <ajohnson at redneon.com>
* Copyright (C) 2014, Hans-Peter Deifel <hpdeifel at gmx.de>
* Copyright (C) 2016 Jakub Alba <jakubalba at gmail.com>
+ * Copyright (C) 2018, Suzuki Toshiya <mpsuzuki at hiroshima-u.ac.jp>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -67,6 +68,14 @@ void delete_all(const Collection &c)
delete_all(c.begin(), c.end());
}
+struct text_box_data // data behind a text_box, held through its m_data pointer
+{
+ ustring text; // returned by text_box::text()
+ rectf bbox; // returned by text_box::bbox()
+ std::vector<rectf> char_bboxes; // indexed by text_box::char_bbox()
+ bool has_space_after; // returned by text_box::has_space_after()
+};
+
}
#endif
diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp
index c147aad7..a1a68251 100644
--- a/cpp/tests/poppler-dump.cpp
+++ b/cpp/tests/poppler-dump.cpp
@@ -2,6 +2,7 @@
* Copyright (C) 2009-2010, Pino Toscano <pino at kde.org>
* Copyright (C) 2017, 2018, Albert Astals Cid <aacid at kde.org>
* Copyright (C) 2017, Jason Alan Palmer <jalanpalmer at gmail.com>
+ * Copyright (C) 2018, Suzuki Toshiya <mpsuzuki at hiroshima-u.ac.jp>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -50,6 +51,7 @@ bool show_embedded_files = false;
bool show_pages = false;
bool show_help = false;
char show_text[32];
+bool show_text_list = false;
poppler::page::text_layout_enum show_text_layout = poppler::page::physical_layout;
static const ArgDesc the_args[] = {
@@ -71,6 +73,8 @@ static const ArgDesc the_args[] = {
"show pages information" },
{ "--show-text", argString, &show_text, sizeof(show_text),
"show text (physical|raw) extracted from all pages" },
+ { "--show-text-list", argFlag, &show_text_list, 0,
+ "show text list (experimental)" },
{ "-h", argFlag, &show_help, 0,
"print usage information" },
{ "--help", argFlag, &show_help, 0,
@@ -323,6 +327,28 @@ static void print_page_text(poppler::page *p)
std::cout << std::endl;
}
+static void print_page_text_list(poppler::page *p) // prints every text_box of page p to std::cout
+{
+ if (!p) { // null page: print a notice and return
+ std::cout << std::setw(out_width) << "Broken Page. Could not be parsed" << std::endl;
+ std::cout << std::endl;
+ return;
+ }
+ auto text_list = p->text_list(); // std::vector<text_box> returned by value
+
+ std::cout << "---" << std::endl; // opening delimiter of the list
+ for (size_t i = 0; i < text_list.size(); i ++) { // one output line per text_box
+ poppler::rectf bbox = text_list[i].bbox();
+ poppler::ustring ustr = text_list[i].text();
+ std::cout << "[" << ustr << "] @ "; // the text, enclosed in brackets
+ std::cout << "( x=" << bbox.x() << " y=" << bbox.y() << " w=" << bbox.width() << " h=" << bbox.height() << " )";
+ std::cout << std::endl;
+
+ }
+ std::cout << "---" << std::endl; // closing delimiter of the list
+}
+
+
int main(int argc, char *argv[])
{
if (!parseArgs(the_args, &argc, argv)
@@ -398,6 +424,14 @@ int main(int argc, char *argv[])
print_page_text(p.get());
}
}
+ if (show_text_list) {
+ const int pages = doc->pages();
+ for (int i = 0; i < pages; ++i) {
+ std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl;
+ std::unique_ptr<poppler::page> p(doc->create_page(i));
+ print_page_text_list(p.get());
+ }
+ }
return 0;
}
More information about the poppler
mailing list