[poppler] 2 commits - glib/poppler-page.cc glib/tests
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Tue Jul 13 21:34:44 UTC 2021
glib/poppler-page.cc | 9 ++++++---
glib/tests/check_text.c | 43 ++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 48 insertions(+), 4 deletions(-)
New commits:
commit fdb83a88ce196413a874c3e0fb6fbd200b56393c
Author: Nelson Benítez León <nbenitezl at gmail.com>
Date: Mon Jul 5 15:42:44 2021 -0400
glib: mimic TextSelectionDumper logic change for spaceAfter
Commit d6cccfb8d814d89c51c9e65563be2e475f46212b caused
issue #1100 because that change in the TextSelectionDumper
logic *must be mimicked* in poppler_page_get_text_layout_for_area()
and in poppler_page_get_text_attributes_for_area() because
all those functions must be consistent with each other in
the way they traverse and extract the text from the PDF.
Otherwise, wrong results may happen when using them
to map between graphical coordinates of text glyphs and
their corresponding positions in the text obtained from
poppler_page_get_text() (which uses TextSelectionDumper
to extract the text).
Fixes issue #1100
diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc
index e81c1e12..684cc07f 100644
--- a/glib/poppler-page.cc
+++ b/glib/poppler-page.cc
@@ -2332,8 +2332,11 @@ gboolean poppler_page_get_text_layout_for_area(PopplerPage *page, PopplerRectang
for (i = 0; i < n_lines; i++) {
std::vector<TextWordSelection *> *line_words = word_list[i];
n_rects += line_words->size() - 1;
- for (const TextWordSelection *word_sel : *line_words) {
+ for (std::size_t j = 0; j < line_words->size(); j++) {
+ const TextWordSelection *word_sel = (*line_words)[j];
n_rects += word_sel->getEnd() - word_sel->getBegin();
+ if (!word_sel->getWord()->hasSpaceAfter() && j < line_words->size() - 1)
+ n_rects--;
}
}
@@ -2356,7 +2359,7 @@ gboolean poppler_page_get_text_layout_for_area(PopplerPage *page, PopplerRectang
rect = *rectangles + offset;
word->getBBox(&x1, &y1, &x2, &y2);
- if (j < line_words->size() - 1) {
+ if (word->hasSpaceAfter() && j < line_words->size() - 1) {
TextWordSelection *next_word_sel = (*line_words)[j + 1];
next_word_sel->getWord()->getBBox(&x3, &y3, &x4, &y4);
@@ -2514,7 +2517,7 @@ GList *poppler_page_get_text_attributes_for_area(PopplerPage *page, PopplerRecta
prev_word_i = word_i;
}
- if (j < line_words->size() - 1) {
+ if (word->hasSpaceAfter() && j < line_words->size() - 1) {
attrs->end_index = offset;
offset++;
}
commit e2f7f5e8eae0cb13d88af4400d68697c6e6bf5ed
Author: Nelson Benítez León <nbenitezl at gmail.com>
Date: Sat Jul 10 00:13:46 2021 -0400
Add glib test for issue #1100
diff --git a/glib/tests/check_text.c b/glib/tests/check_text.c
index 8b10a7a0..9b0c5b61 100644
--- a/glib/tests/check_text.c
+++ b/glib/tests/check_text.c
@@ -16,6 +16,8 @@ int main(int argc, char *argv[])
GFile *infile;
PopplerDocument *doc;
PopplerPage *page;
+ PopplerRectangle *areas = NULL;
+ guint n_glyph_areas, n_utf8_chars;
int npages, n;
char *text;
GError *err = NULL;
@@ -48,7 +50,46 @@ int main(int argc, char *argv[])
text = poppler_page_get_text(page);
g_print("%s\n", text);
g_assert_cmpstr(text, ==, "The slow brown fox jumps over the black dog.");
- g_object_unref(page);
+
+ /* Cleanup vars for next test */
+ g_clear_object(&page);
+ g_clear_object(&doc);
+ g_clear_object(&infile);
+ g_clear_pointer(&text, g_free);
+
+ /* Test for consistency between utf8 characters returned by poppler_page_get_text()
+ * and glyph layout areas returned by poppler_page_get_text_layout(). Issue #1100 */
+ g_print("Consistency test between poppler_page_get_text() and poppler_page_get_text_layout()\n");
+ g_print("Issue #1100 \n");
+ infile = g_file_new_for_path(TESTDATADIR "/unittestcases/searchAcrossLines.pdf");
+ if (!infile)
+ exit(EXIT_FAILURE);
+
+ doc = poppler_document_new_from_gfile(infile, NULL, NULL, &err);
+ if (doc == NULL) {
+ g_printerr("error opening pdf file: %s\n", err->message);
+ g_error_free(err);
+ exit(EXIT_FAILURE);
+ }
+
+ page = poppler_document_get_page(doc, 0);
+ if (page == NULL || !POPPLER_IS_PAGE(page)) {
+ g_print("error opening pdf page\n");
+ exit(EXIT_FAILURE);
+ }
+
+ text = poppler_page_get_text(page);
+ n_utf8_chars = (guint)g_utf8_strlen(text, -1);
+ poppler_page_get_text_layout(page, &areas, &n_glyph_areas);
+ g_assert_cmpuint(n_glyph_areas, ==, n_utf8_chars);
+ g_print("Test: OK ('layout glyph areas' match amount of 'utf8 characters')\n");
+
+ /* Cleanup vars for next test */
+ g_clear_object(&page);
+ g_clear_object(&doc);
+ g_clear_object(&infile);
+ g_clear_pointer(&areas, g_free);
+ g_clear_pointer(&text, g_free);
return EXIT_SUCCESS;
}
More information about the poppler
mailing list