[poppler] 3 commits - cpp/poppler-page.cpp cpp/poppler-page.h cpp/tests

Pino Toscano pino at kemper.freedesktop.org
Wed Sep 15 08:29:46 PDT 2010


 cpp/poppler-page.cpp       |   27 +++++++++++++++++++++-
 cpp/poppler-page.h         |    5 ++++
 cpp/tests/poppler-dump.cpp |   53 +++++++++++++++++++++++++++++++++++++++------
 3 files changed, 76 insertions(+), 9 deletions(-)

New commits:
commit b9333529bba43a71655fdbf1919ba515f7df9ca3
Author: Pino Toscano <pino at kde.org>
Date:   Wed Sep 15 17:23:54 2010 +0200

    [cpp/tests] poppler-dump: convert out_ustring() to an operator<<(std::ostream&)
    
    so we have a chance to better output the bytearray of a string to the stream

diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp
index e9a068c..cb0ee75 100644
--- a/cpp/tests/poppler-dump.cpp
+++ b/cpp/tests/poppler-dump.cpp
@@ -81,9 +81,13 @@ static void error(const std::string &msg)
     exit(1);
 }
 
-static std::string out_ustring(const poppler::ustring &str)
+std::ostream& operator<<(std::ostream& stream, const poppler::ustring &str)
 {
-    return str.to_latin1();
+    const poppler::byte_array ba = str.to_utf8();
+    for (unsigned int i = 0; i < ba.size(); ++i) {
+        stream << (char)(ba[i]);
+    }
+    return stream;
 }
 
 static std::string out_date(std::time_t date)
@@ -174,7 +178,7 @@ static void print_info(poppler::document *doc)
     const std::vector<std::string> keys = doc->info_keys();
     std::vector<std::string>::const_iterator key_it = keys.begin(), key_end = keys.end();
     for (; key_it != key_end; ++key_it) {
-        std::cout << std::setw(out_width) << *key_it << ": " << out_ustring(doc->info_key(*key_it)) << std::endl;
+        std::cout << std::setw(out_width) << *key_it << ": " << doc->info_key(*key_it) << std::endl;
     }
     std::cout << std::setw(out_width) << "Date (creation)" << ": " << out_date(doc->info_date("CreationDate")) << std::endl;
     std::cout << std::setw(out_width) << "Date (modification)" << ": " << out_date(doc->info_date("ModDate")) << std::endl;
@@ -205,14 +209,14 @@ static void print_perm(poppler::document *doc)
 static void print_metadata(poppler::document *doc)
 {
     std::cout << std::setw(out_width) << "Metadata" << ":" << std::endl
-              << out_ustring(doc->metadata()) << std::endl;
+              << doc->metadata() << std::endl;
     std::cout << std::endl;
 }
 
 static void print_toc_item(poppler::toc_item *item, int indent)
 {
     std::cout << std::setw(indent * 2) << " "
-              << "+ " << out_ustring(item->title()) << " (" << item->is_open() << ")"
+              << "+ " << item->title() << " (" << item->is_open() << ")"
               << std::endl;
     poppler::toc_item::iterator it = item->children_begin(), it_end = item->children_end();
     for (; it != it_end; ++it) {
@@ -271,7 +275,13 @@ static void print_embedded_files(poppler::document *doc)
                 << " " << std::setw(20) << out_date(f->creation_date())
                 << " " << std::setw(20) << out_date(f->modification_date())
                 << std::endl
-                << "     " << (f->description().empty() ? std::string("<no description>") : out_ustring(f->description()))
+                << "     ";
+            if (f->description().empty()) {
+                std::cout << "<no description>";
+            } else {
+                std::cout << f->description();
+            }
+            std::cout
                 << std::endl
                 << "     " << std::setw(35) << (f->checksum().empty() ? std::string("<no checksum>") : out_hex_string(f->checksum()))
                 << " " << (f->mime_type().empty() ? std::string("<no mime type>") : f->mime_type())
@@ -287,7 +297,7 @@ static void print_embedded_files(poppler::document *doc)
 static void print_page(poppler::page *p)
 {
     std::cout << std::setw(out_width) << "Rect" << ": " << p->page_rect() << std::endl;
-    std::cout << std::setw(out_width) << "Label" << ": " << out_ustring(p->label()) << std::endl;
+    std::cout << std::setw(out_width) << "Label" << ": " << p->label() << std::endl;
     std::cout << std::setw(out_width) << "Duration" << ": " << p->duration() << std::endl;
     std::cout << std::setw(out_width) << "Orientation" << ": " << out_page_orientation(p->orientation()) << std::endl;
     std::cout << std::endl;
@@ -295,7 +305,7 @@ static void print_page(poppler::page *p)
 
 static void print_page_text(poppler::page *p)
 {
-    std::cout << out_ustring(p->text(p->page_rect(), show_text_layout)) << std::endl;
+    std::cout << p->text(p->page_rect(), show_text_layout) << std::endl;
     std::cout << std::endl;
 }
 
commit a44f711b4412332875337e9fb7509f18db806ddc
Author: Pino Toscano <pino at kde.org>
Date:   Wed Sep 15 16:44:30 2010 +0200

    [cpp/tests] poppler-dump: add a "--show-text <physical|raw>" option
    
    ... to show the text of a page in the specified layout

diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp
index 104aaa4..e9a068c 100644
--- a/cpp/tests/poppler-dump.cpp
+++ b/cpp/tests/poppler-dump.cpp
@@ -23,6 +23,7 @@
 #include <poppler-toc.h>
 
 #include <cstdlib>
+#include <cstring>
 #include <ctime>
 #include <iomanip>
 #include <iostream>
@@ -44,6 +45,8 @@ bool show_fonts = false;
 bool show_embedded_files = false;
 bool show_pages = false;
 bool show_help = false;
+char show_text[32];
+poppler::page::text_layout_enum show_text_layout = poppler::page::physical_layout;
 
 static const ArgDesc the_args[] = {
     { "--show-all",            argFlag,  &show_all,            0,
@@ -62,6 +65,8 @@ static const ArgDesc the_args[] = {
       "show the document-level embedded files" },
     { "--show-pages",          argFlag,  &show_pages,          0,
       "show pages information" },
+    { "--show-text",           argString, &show_text,          sizeof(show_text),
+      "show text (physical|raw) extracted from all pages" },
     { "-h",                    argFlag,  &show_help,           0,
       "print usage information" },
     { "--help",                argFlag,  &show_help,           0,
@@ -288,6 +293,12 @@ static void print_page(poppler::page *p)
     std::cout << std::endl;
 }
 
+static void print_page_text(poppler::page *p)
+{
+    std::cout << out_ustring(p->text(p->page_rect(), show_text_layout)) << std::endl;
+    std::cout << std::endl;
+}
+
 int main(int argc, char *argv[])
 {
     if (!parseArgs(the_args, &argc, argv)
@@ -296,6 +307,16 @@ int main(int argc, char *argv[])
         exit(1);
     }
 
+    if (show_text[0]) {
+        if (!memcmp(show_text, "physical", 9)) {
+            show_text_layout = poppler::page::physical_layout;
+        } else if (!memcmp(show_text, "raw", 4)) {
+            show_text_layout = poppler::page::raw_order_layout;
+        } else {
+            error(std::string("unrecognized text mode: '") + show_text + "'");
+        }
+    }
+
     std::string file_name(argv[1]);
 
     std::auto_ptr<poppler::document> doc(poppler::document::load_from_file(file_name));
@@ -345,6 +366,14 @@ int main(int argc, char *argv[])
             print_page(p.get());
         }
     }
+    if (show_text[0]) {
+        const int pages = doc->pages();
+        for (int i = 0; i < pages; ++i) {
+            std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl;
+            std::auto_ptr<poppler::page> p(doc->create_page(i));
+            print_page_text(p.get());
+        }
+    }
 
     return 0;
 }
commit 0094c9372b5b439af2564d83d6fb7439f4bdba88
Author: Pino Toscano <pino at kde.org>
Date:   Wed Sep 15 13:19:13 2010 +0200

    [cpp] add a new page::text() for specifying a layout mode
    
    add a new text_layout_enum enum for the layout mode, used by the new text()
    make the old text() implementation call the new one with the old value (= physical)
    add & adapt the apidox accordingly

diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index 1bfb8d4..4e2f730 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -60,6 +60,12 @@ page_private::~page_private()
  The direction/action to follow when performing a text search.
 */
 
+/**
+ \enum poppler::page::text_layout_enum
+
+ A layout of the text of a page.
+*/
+
 
 page::page(document_private *doc, int index)
     : d(new page_private(doc, index))
@@ -234,7 +240,7 @@ bool page::search(const ustring &text, rectf &r, search_direction_enum direction
 }
 
 /**
- Returns the text in the page.
+ Returns the text in the page, in its physical layout.
 
  \param r if not empty, it will be extracted the text in it; otherwise, the
           text of the whole page
@@ -243,8 +249,25 @@ bool page::search(const ustring &text, rectf &r, search_direction_enum direction
  */
 ustring page::text(const rectf &r) const
 {
+    return text(r, physical_layout);
+}
+
+/**
+ Returns the text in the page.
+
+ \param rect if not empty, only the text inside it is extracted; otherwise, the
+             text of the whole page is extracted
+ \param layout_mode the layout of the text
+
+ \returns the text of the page in the specified rect or in the whole page
+
+ \since 0.16
+ */
+ustring page::text(const rectf &r, text_layout_enum layout_mode) const
+{
     std::auto_ptr<GooString> s;
-    TextOutputDev td(0, gFalse, gFalse, gFalse);
+    const GBool use_raw_order = (layout_mode == raw_order_layout);
+    TextOutputDev td(0, gFalse, use_raw_order, gFalse);
     d->doc->doc->displayPage(&td, d->index + 1, 72, 72, 0, false, true, false);
     if (r.is_empty()) {
         const PDFRectangle *rect = d->page->getCropBox();
diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index 89fdea6..7b4298a 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -44,6 +44,10 @@ public:
         search_next_result,
         search_previous_result
     };
+    enum text_layout_enum {
+        physical_layout,
+        raw_order_layout
+    };
 
     ~page();
 
@@ -57,6 +61,7 @@ public:
     bool search(const ustring &text, rectf &r, search_direction_enum direction,
                 case_sensitivity_enum case_sensitivity, rotation_enum rotation = rotate_0) const;
     ustring text(const rectf &rect = rectf()) const;
+    ustring text(const rectf &rect, text_layout_enum layout_mode) const;
 
 private:
     page(document_private *doc, int index);


More information about the poppler mailing list