[poppler] glib/demo glib/poppler.h glib/poppler-page.cc glib/poppler-page.h glib/poppler-private.h glib/reference poppler/TextOutputDev.cc poppler/TextOutputDev.h qt5/src qt5/tests qt6/src qt6/tests
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Sun Apr 25 22:24:52 UTC 2021
glib/demo/find.c | 83 +++++++++++-----
glib/poppler-page.cc | 144 +++++++++++++++++++++++++---
glib/poppler-page.h | 4
glib/poppler-private.h | 19 +++
glib/poppler.h | 7 +
glib/reference/poppler-sections.txt | 2
poppler/TextOutputDev.cc | 182 +++++++++++++++++++++++++++++-------
poppler/TextOutputDev.h | 15 ++
qt5/src/poppler-page-private.h | 4
qt5/src/poppler-page.cc | 39 +++++--
qt5/src/poppler-qt5.h | 12 +-
qt5/tests/check_search.cpp | 128 +++++++++++++++++++++----
qt6/src/poppler-page-private.h | 4
qt6/src/poppler-page.cc | 35 +++++-
qt6/src/poppler-qt6.h | 12 +-
qt6/tests/check_search.cpp | 85 ++++++++++++++++
16 files changed, 652 insertions(+), 123 deletions(-)
New commits:
commit e3fed321f230a4a91df873e6d9a213ba8dad6694
Author: Nelson Benítez León <nbenitezl at gmail.com>
Date: Sun Apr 25 22:24:50 2021 +0000
find, glib: Enhance find to support multi-line matching
On the backend side, adds 3 new parameters to TextPage::findText(),
one bool to enable the feature, one out PDFRectangle to store
the part of the match that falls on the next line, and one out
bool to inform whether hyphen was present and ignored at end of
the previous match part.
For the glib binding, this extends the public PopplerRectangle
struct by new members to hold additional information about
whether the rectangle belongs to a group of rectangles for the
same match, and whether a hyphen was ignored at the end of the
line. Since PopplerRectangle is public ABI, this is done by making
the public PopplerRectangle API return the enlarged struct, and
internally casting to the new struct when required, the new
members are accessible only via accessor functions.
For the Qt5 and Qt6 bindings, this commit only implements the new flag
Poppler::Page::AcrossLines (but no new function and no new
return data type) and if this flag is passed, the returned
list of rectangles will also include rectangles for the
second part of across-line matches.
This minimal Qt binding still allows for the creation of
tests for this feature (using the Qt test framework), which
this commit *does include*. But a more complete binding (with
a new return type that includes 'matchContinued' and 'ignoredHyphen'
boolean fields) is left for the Qt backend maintainers to do
if they want to use this feature in e.g. Okular.
So, as mentioned, this commit incorporates tests for the
implemented across-line matching feature, and the tests do
also check for two included aspects of this feature, which are:
- Ignoring a hyphen character while matching when 1) it's the
last character of the line and 2) its corresponding matching
character in the search term is not also a hyphen.
- Any whitespace characters in the search term will be allowed
to match at the logical position where the lines split (i.e. what
would normally be the newline character in a text file, but
PDF text does not include newline characters between lines).
Regarding the enhancement to findText() function which implements
matching across lines, just two more notes:
- It won't match on text spanning more than two lines, i.e. it
only matches text spanning from end of one line to start of
next line.
- It does not support finding backwards: if findText() receives
both <backward> and <matchAcrossLines> parameters as true, it
will ignore the <matchAcrossLines> parameter. Implementing
<matchAcrossLines> with backwards direction is possible, but
it would make an already complex function like findText()
even more complex, for little gain, as e.g. Evince does not even
use the <backward> parameter of findText().
Fixes poppler issues #744 and #755
Related Evince issue https://gitlab.gnome.org/GNOME/evince/issues/333
diff --git a/glib/demo/find.c b/glib/demo/find.c
index b3ae9101..080bc196 100644
--- a/glib/demo/find.c
+++ b/glib/demo/find.c
@@ -85,6 +85,23 @@ static void pgd_find_update_progress(PgdFindDemo *demo, gint scanned)
g_free(str);
}
+static void pgd_find_append_match(PgdFindDemo *demo, GtkTreeModel *model, GtkTreeIter *iter_child, PopplerRectangle *rect, int match_id)
+{
+ char *x1, *y1, *x2, *y2, *str;
+ str = g_strdup_printf("Match %d", match_id + 1);
+ x1 = g_strdup_printf("%.2f", rect->x1);
+ y1 = g_strdup_printf("%.2f", rect->y1);
+ x2 = g_strdup_printf("%.2f", rect->x2);
+ y2 = g_strdup_printf("%.2f", rect->y2);
+ gtk_tree_store_set(GTK_TREE_STORE(model), iter_child, TITLE_COLUMN, str, X1_COLUMN, x1, Y1_COLUMN, y1, X2_COLUMN, x2, Y2_COLUMN, y2, VISIBLE_COLUMN, TRUE, PAGE_COLUMN, demo->page_index, PAGE_RECT, rect, -1);
+ g_free(str);
+ g_free(x1);
+ g_free(y1);
+ g_free(x2);
+ g_free(y2);
+ g_object_weak_ref(G_OBJECT(model), (GWeakNotify)poppler_rectangle_free, rect);
+}
+
static gboolean pgd_find_find_text(PgdFindDemo *demo)
{
PopplerPage *page;
@@ -103,46 +120,31 @@ static gboolean pgd_find_find_text(PgdFindDemo *demo)
matches = poppler_page_find_text_with_options(page, gtk_entry_get_text(GTK_ENTRY(demo->entry)), demo->options);
g_timer_stop(timer);
if (matches) {
- GtkTreeIter iter;
+ GtkTreeIter iter, iter_child;
gchar *str;
GList *l;
gdouble height;
gint n_match = 0;
- str = g_strdup_printf("%d matches found on page %d in %.4f seconds", g_list_length(matches), demo->page_index + 1, g_timer_elapsed(timer, NULL));
-
gtk_tree_store_append(GTK_TREE_STORE(model), &iter, NULL);
- gtk_tree_store_set(GTK_TREE_STORE(model), &iter, TITLE_COLUMN, str, VISIBLE_COLUMN, FALSE, PAGE_COLUMN, demo->page_index, -1);
- g_free(str);
-
poppler_page_get_size(page, NULL, &height);
-
for (l = matches; l && l->data; l = g_list_next(l)) {
PopplerRectangle *rect = (PopplerRectangle *)l->data;
- GtkTreeIter iter_child;
- gchar *x1, *y1, *x2, *y2;
gdouble tmp;
-
- str = g_strdup_printf("Match %d", ++n_match);
- x1 = g_strdup_printf("%.2f", rect->x1);
- y1 = g_strdup_printf("%.2f", rect->y1);
- x2 = g_strdup_printf("%.2f", rect->x2);
- y2 = g_strdup_printf("%.2f", rect->y2);
-
tmp = rect->y1;
rect->y1 = height - rect->y2;
rect->y2 = height - tmp;
-
gtk_tree_store_append(GTK_TREE_STORE(model), &iter_child, &iter);
- gtk_tree_store_set(GTK_TREE_STORE(model), &iter_child, TITLE_COLUMN, str, X1_COLUMN, x1, Y1_COLUMN, y1, X2_COLUMN, x2, Y2_COLUMN, y2, VISIBLE_COLUMN, TRUE, PAGE_COLUMN, demo->page_index, PAGE_RECT, rect, -1);
- g_free(str);
- g_free(x1);
- g_free(y1);
- g_free(x2);
- g_free(y2);
- g_object_weak_ref(G_OBJECT(model), (GWeakNotify)poppler_rectangle_free, rect);
+ pgd_find_append_match(demo, model, &iter_child, rect, n_match);
+ if (!poppler_rectangle_find_get_match_continued(rect))
+ ++n_match;
}
g_list_free(matches);
+
+ str = g_strdup_printf("%d matches found on page %d in %.4f seconds", n_match, demo->page_index + 1, g_timer_elapsed(timer, NULL));
+
+ gtk_tree_store_set(GTK_TREE_STORE(model), &iter, TITLE_COLUMN, str, VISIBLE_COLUMN, FALSE, PAGE_COLUMN, demo->page_index, -1);
+ g_free(str);
}
g_timer_destroy(timer);
@@ -154,6 +156,11 @@ static gboolean pgd_find_find_text(PgdFindDemo *demo)
return demo->page_index < demo->n_pages;
}
+static void find_text_idle_finish(PgdFindDemo *demo)
+{
+ demo->idle_id = 0;
+}
+
static cairo_surface_t *pgd_find_render_page(PgdFindDemo *demo)
{
cairo_t *cr;
@@ -252,7 +259,7 @@ static void pgd_find_button_clicked(GtkButton *button, PgdFindDemo *demo)
pgd_find_update_progress(demo, demo->page_index);
if (demo->idle_id > 0)
g_source_remove(demo->idle_id);
- demo->idle_id = g_idle_add((GSourceFunc)pgd_find_find_text, demo);
+ demo->idle_id = g_idle_add_full(G_PRIORITY_DEFAULT_IDLE, (GSourceFunc)pgd_find_find_text, demo, (GDestroyNotify)find_text_idle_finish);
}
static void pgd_find_button_sensitivity_cb(GtkWidget *button, GtkEntry *entry)
@@ -309,6 +316,22 @@ static void pgd_find_backwards_toggled(GtkToggleButton *togglebutton, PgdFindDem
demo->options &= ~POPPLER_FIND_BACKWARDS;
}
+static void pgd_find_multiline_toggled(GtkToggleButton *togglebutton, PgdFindDemo *demo)
+{
+ if (gtk_toggle_button_get_active(togglebutton))
+ demo->options |= POPPLER_FIND_MULTILINE;
+ else
+ demo->options &= ~POPPLER_FIND_MULTILINE;
+}
+
+static void pgd_find_ignore_diacritics_toggled(GtkToggleButton *togglebutton, PgdFindDemo *demo)
+{
+ if (gtk_toggle_button_get_active(togglebutton))
+ demo->options |= POPPLER_FIND_IGNORE_DIACRITICS;
+ else
+ demo->options &= ~POPPLER_FIND_IGNORE_DIACRITICS;
+}
+
static void pgd_find_whole_words_toggled(GtkToggleButton *togglebutton, PgdFindDemo *demo)
{
if (gtk_toggle_button_get_active(togglebutton))
@@ -345,6 +368,16 @@ GtkWidget *pgd_find_create_widget(PopplerDocument *document)
hbox = gtk_box_new(GTK_ORIENTATION_HORIZONTAL, 6);
+ checkbutton = gtk_check_button_new_with_label("Multi-line");
+ g_signal_connect(checkbutton, "toggled", G_CALLBACK(pgd_find_multiline_toggled), demo);
+ gtk_box_pack_start(GTK_BOX(hbox), checkbutton, FALSE, FALSE, 0);
+ gtk_widget_show(checkbutton);
+
+ checkbutton = gtk_check_button_new_with_label("Ignore diacritics");
+ g_signal_connect(checkbutton, "toggled", G_CALLBACK(pgd_find_ignore_diacritics_toggled), demo);
+ gtk_box_pack_start(GTK_BOX(hbox), checkbutton, FALSE, FALSE, 0);
+ gtk_widget_show(checkbutton);
+
demo->entry = gtk_entry_new();
gtk_box_pack_start(GTK_BOX(hbox), demo->entry, FALSE, TRUE, 0);
gtk_widget_show(demo->entry);
diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc
index 3332a9eb..7536cfcb 100644
--- a/glib/poppler-page.cc
+++ b/glib/poppler-page.cc
@@ -47,6 +47,8 @@ enum
PROP_LABEL
};
+static PopplerRectangleExtended *poppler_rectangle_extended_new();
+
typedef struct _PopplerPageClass PopplerPageClass;
struct _PopplerPageClass
{
@@ -615,12 +617,7 @@ GList *poppler_page_get_selection_region(PopplerPage *page, gdouble scale, Poppl
for (const PDFRectangle *selection_rect : *list) {
PopplerRectangle *rect;
- rect = poppler_rectangle_new();
-
- rect->x1 = selection_rect->x1;
- rect->y1 = selection_rect->y1;
- rect->x2 = selection_rect->x2;
- rect->y2 = selection_rect->y2;
+ rect = poppler_rectangle_new_from_pdf_rectangle(selection_rect);
region = g_list_prepend(region, rect);
@@ -811,15 +808,33 @@ char *poppler_page_get_text_for_area(PopplerPage *page, PopplerRectangle *area)
* returns a #GList of rectangles for each occurrence of the text on the page.
* The coordinates are in PDF points.
*
- * Return value: (element-type PopplerRectangle) (transfer full): a #GList of #PopplerRectangle,
+ * When %POPPLER_FIND_MULTILINE is passed in @options, matches may span more than
+ * one line. In this case, the returned list will contain one #PopplerRectangle
+ * for each part of a match. The function poppler_rectangle_find_get_match_continued()
+ * will return %TRUE for all rectangles belonging to the same match, except for
+ * the last one. If a hyphen was ignored at the end of the part of the match,
+ * poppler_rectangle_find_get_ignored_hyphen() will return %TRUE for that
+ * rectangle.
+ *
+ * Note that currently matches spanning more than two lines are not found.
+ * (This limitation may be lifted in a future version.)
+ *
+ * Note also that currently finding multi-line matches backwards is not
+ * implemented; if you pass %POPPLER_FIND_BACKWARDS and %POPPLER_FIND_MULTILINE
+ * together, %POPPLER_FIND_MULTILINE will be ignored.
+ *
+ * Return value: (element-type PopplerRectangle) (transfer full): a newly allocated list
+ * of newly allocated #PopplerRectangle. Free with g_list_free_full() using poppler_rectangle_free().
*
* Since: 0.22
**/
GList *poppler_page_find_text_with_options(PopplerPage *page, const char *text, PopplerFindFlags options)
{
- PopplerRectangle *match;
+ PopplerRectangleExtended *match;
GList *matches;
double xMin, yMin, xMax, yMax;
+ PDFRectangle continueMatch;
+ bool ignoredHyphen;
gunichar *ucs4;
glong ucs4_len;
double height;
@@ -835,22 +850,46 @@ GList *poppler_page_find_text_with_options(PopplerPage *page, const char *text,
ucs4 = g_utf8_to_ucs4_fast(text, -1, &ucs4_len);
poppler_page_get_size(page, nullptr, &height);
+ const bool multiline = (options & POPPLER_FIND_MULTILINE);
backwards = options & POPPLER_FIND_BACKWARDS;
matches = nullptr;
xMin = 0;
yMin = backwards ? height : 0;
+ continueMatch.x1 = G_MAXDOUBLE; // we use this to detect valid returned values
+
while (text_dev->findText(ucs4, ucs4_len, false, true, // startAtTop, stopAtBottom
start_at_last,
false, // stopAtLast
- options & POPPLER_FIND_CASE_SENSITIVE, options & POPPLER_FIND_IGNORE_DIACRITICS, backwards, options & POPPLER_FIND_WHOLE_WORDS_ONLY, &xMin, &yMin, &xMax, &yMax)) {
- match = poppler_rectangle_new();
+ options & POPPLER_FIND_CASE_SENSITIVE, options & POPPLER_FIND_IGNORE_DIACRITICS, options & POPPLER_FIND_MULTILINE, backwards, options & POPPLER_FIND_WHOLE_WORDS_ONLY, &xMin, &yMin, &xMax, &yMax, &continueMatch,
+ &ignoredHyphen)) {
+ match = poppler_rectangle_extended_new();
match->x1 = xMin;
match->y1 = height - yMax;
match->x2 = xMax;
match->y2 = height - yMin;
+ match->match_continued = false;
+ match->ignored_hyphen = false;
matches = g_list_prepend(matches, match);
start_at_last = TRUE;
+
+ if (continueMatch.x1 != G_MAXDOUBLE) {
+ // received rect for next-line part of a multi-line match, add it.
+ if (multiline) {
+ match->match_continued = true;
+ match->ignored_hyphen = ignoredHyphen;
+ match = poppler_rectangle_extended_new();
+ match->x1 = continueMatch.x1;
+ match->y1 = height - continueMatch.y1;
+ match->x2 = continueMatch.x2;
+ match->y2 = height - continueMatch.y2;
+ match->match_continued = false;
+ match->ignored_hyphen = false;
+ matches = g_list_prepend(matches, match);
+ }
+
+ continueMatch.x1 = G_MAXDOUBLE;
+ }
}
g_free(ucs4);
@@ -1565,6 +1604,22 @@ void poppler_page_remove_annot(PopplerPage *page, PopplerAnnot *annot)
G_DEFINE_BOXED_TYPE(PopplerRectangle, poppler_rectangle, poppler_rectangle_copy, poppler_rectangle_free)
+static PopplerRectangleExtended *poppler_rectangle_extended_new()
+{
+ return g_slice_new0(PopplerRectangleExtended);
+}
+
+PopplerRectangle *poppler_rectangle_new_from_pdf_rectangle(const PDFRectangle *rect)
+{
+ auto r = poppler_rectangle_extended_new();
+ r->x1 = rect->x1;
+ r->y1 = rect->y1;
+ r->x2 = rect->x2;
+ r->y2 = rect->y2;
+
+ return reinterpret_cast<PopplerRectangle *>(r);
+}
+
/**
* poppler_rectangle_new:
*
@@ -1574,36 +1629,95 @@ G_DEFINE_BOXED_TYPE(PopplerRectangle, poppler_rectangle, poppler_rectangle_copy,
*/
PopplerRectangle *poppler_rectangle_new(void)
{
- return g_slice_new0(PopplerRectangle);
+ return reinterpret_cast<PopplerRectangle *>(poppler_rectangle_extended_new());
}
/**
* poppler_rectangle_copy:
* @rectangle: a #PopplerRectangle to copy
*
- * Creates a copy of @rectangle
+ * Creates a copy of @rectangle.
*
+ * Note that you must only use this function on an allocated PopplerRectangle, as
+ * returned by poppler_rectangle_new(), poppler_rectangle_copy(), or the list elements
+ * returned from poppler_page_find_text() or poppler_page_find_text_with_options().
* Returns: a new allocated copy of @rectangle
*/
PopplerRectangle *poppler_rectangle_copy(PopplerRectangle *rectangle)
{
g_return_val_if_fail(rectangle != nullptr, NULL);
- return g_slice_dup(PopplerRectangle, rectangle);
+ auto ext_rectangle = reinterpret_cast<PopplerRectangleExtended *>(rectangle);
+ return reinterpret_cast<PopplerRectangle *>(g_slice_dup(PopplerRectangleExtended, ext_rectangle));
}
/**
* poppler_rectangle_free:
* @rectangle: a #PopplerRectangle
*
- * Frees the given #PopplerRectangle
+ * Frees the given #PopplerRectangle.
+ *
+ * Note that you must only use this function on an allocated PopplerRectangle, as
+ * returned by poppler_rectangle_new(), poppler_rectangle_copy(), or the list elements
+ * returned from poppler_page_find_text() or poppler_page_find_text_with_options().
*/
void poppler_rectangle_free(PopplerRectangle *rectangle)
{
g_slice_free(PopplerRectangle, rectangle);
}
-/* PopplerPoint type */
+/**
+ * poppler_rectangle_find_get_match_continued:
+ * @rectangle: a #PopplerRectangle
+ *
+ * When using poppler_page_find_text_with_options() with the
+ * %POPPLER_FIND_MULTILINE flag, a match may span more than one line
+ * and thus consist of more than one rectangle. Every rectangle belonging
+ * to the same match will return %TRUE from this function, except for
+ * the last rectangle, where this function will return %FALSE.
+ *
+ * Note that you must only call this function on a #PopplerRectangle
+ * returned in the list from poppler_page_find_text() or
+ * poppler_page_find_text_with_options().
+ *
+ * Returns: whether there are more rectangles belonging to the same match
+ *
+ * Since: 21.05.0
+ */
+gboolean poppler_rectangle_find_get_match_continued(const PopplerRectangle *rectangle)
+{
+ g_return_val_if_fail(rectangle != nullptr, false);
+
+ auto ext_rectangle = reinterpret_cast<const PopplerRectangleExtended *>(rectangle);
+ return ext_rectangle->match_continued;
+}
+
+/**
+ * poppler_rectangle_find_get_ignored_hyphen:
+ * @rectangle: a #PopplerRectangle
+ *
+ * When using poppler_page_find_text_with_options() with the
+ * %POPPLER_FIND_MULTILINE flag, a match may span more than one line,
+ * and may have been formed by ignoring a hyphen at the end of the line.
+ * When this happens at the end of the line corresponding to @rectangle,
+ * this function returns %TRUE (and then poppler_rectangle_find_get_match_continued()
+ * will also return %TRUE); otherwise it returns %FALSE.
+ *
+ * Note that you must only call this function on a #PopplerRectangle
+ * returned in the list from poppler_page_find_text() or
+ * poppler_page_find_text_with_options().
+ *
+ * Returns: whether a hyphen was ignored at the end of the line corresponding to @rectangle.
+ *
+ * Since: 21.05.0
+ */
+gboolean poppler_rectangle_find_get_ignored_hyphen(const PopplerRectangle *rectangle)
+{
+ g_return_val_if_fail(rectangle != nullptr, false);
+
+ auto ext_rectangle = reinterpret_cast<const PopplerRectangleExtended *>(rectangle);
+ return ext_rectangle->ignored_hyphen;
+}
G_DEFINE_BOXED_TYPE(PopplerPoint, poppler_point, poppler_point_copy, poppler_point_free)
diff --git a/glib/poppler-page.h b/glib/poppler-page.h
index 95b0cf9c..2d037d8e 100644
--- a/glib/poppler-page.h
+++ b/glib/poppler-page.h
@@ -140,6 +140,10 @@ POPPLER_PUBLIC
PopplerRectangle *poppler_rectangle_copy(PopplerRectangle *rectangle);
POPPLER_PUBLIC
void poppler_rectangle_free(PopplerRectangle *rectangle);
+POPPLER_PUBLIC
+gboolean poppler_rectangle_find_get_match_continued(const PopplerRectangle *rectangle);
+POPPLER_PUBLIC
+gboolean poppler_rectangle_find_get_ignored_hyphen(const PopplerRectangle *rectangle);
/* A point on a page, with coordinates in PDF points. */
#define POPPLER_TYPE_POINT (poppler_point_get_type())
diff --git a/glib/poppler-private.h b/glib/poppler-private.h
index 10272716..02967fbf 100644
--- a/glib/poppler-private.h
+++ b/glib/poppler-private.h
@@ -112,6 +112,25 @@ struct _PopplerStructureElement
const StructElement *elem;
};
+/*
+ * PopplerRectangleExtended:
+ *
+ * The real type behind the public PopplerRectangle.
+ * Must be ABI compatible to it!
+ */
+typedef struct
+{
+ /*< private >*/
+ double x1;
+ double y1;
+ double x2;
+ double y2;
+ bool match_continued; /* Described in poppler_rectangle_find_get_match_continued() */
+ bool ignored_hyphen; /* Described in poppler_rectangle_find_get_ignored_hyphen() */
+} PopplerRectangleExtended;
+
+PopplerRectangle *poppler_rectangle_new_from_pdf_rectangle(const PDFRectangle *rect);
+
GList *_poppler_document_get_layers(PopplerDocument *document);
GList *_poppler_document_get_layer_rbgroup(PopplerDocument *document, Layer *layer);
PopplerPage *_poppler_page_new(PopplerDocument *document, Page *page, int index);
diff --git a/glib/poppler.h b/glib/poppler.h
index 5692c28d..35a3bfd4 100644
--- a/glib/poppler.h
+++ b/glib/poppler.h
@@ -157,6 +157,10 @@ typedef enum /*< flags >*/
* @POPPLER_FIND_IGNORE_DIACRITICS: do diacritics insensitive search,
* i.e. ignore accents, umlauts, diaeresis,etc. while matching. This
* option will be ignored if the search term is not pure ascii. Since 0.73.
+ * @POPPLER_FIND_MULTILINE: allows to match on text spanning from
+ * end of a line to the next line. (Currently it won't match on text spanning
+ * more than two lines.) Automatically ignores hyphen at end of line, and
+ * allows whitespace in search term to match on newline char. Since: 21.05.0.
*
* Flags using while searching text in a page
*
@@ -168,7 +172,8 @@ typedef enum /*< flags >*/
POPPLER_FIND_CASE_SENSITIVE = 1 << 0,
POPPLER_FIND_BACKWARDS = 1 << 1,
POPPLER_FIND_WHOLE_WORDS_ONLY = 1 << 2,
- POPPLER_FIND_IGNORE_DIACRITICS = 1 << 3
+ POPPLER_FIND_IGNORE_DIACRITICS = 1 << 3,
+ POPPLER_FIND_MULTILINE = 1 << 4
} PopplerFindFlags;
typedef struct _PopplerDocument PopplerDocument;
diff --git a/glib/reference/poppler-sections.txt b/glib/reference/poppler-sections.txt
index b6f8ecc9..8ad07b09 100644
--- a/glib/reference/poppler-sections.txt
+++ b/glib/reference/poppler-sections.txt
@@ -92,6 +92,8 @@ poppler_quadrilateral_copy
poppler_quadrilateral_free
poppler_quadrilateral_new
poppler_rectangle_copy
+poppler_rectangle_find_get_match_continued
+poppler_rectangle_find_get_ignored_hyphen
poppler_rectangle_free
poppler_rectangle_new
poppler_text_attributes_copy
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 956c1328..7bc1b920 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -3798,22 +3798,62 @@ void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML)
#endif
}
+void TextPage::adjustRotation(TextLine *line, int start, int end, double *xMin, double *xMax, double *yMin, double *yMax)
+{
+ switch (line->rot) {
+ case 0:
+ *xMin = line->edge[start];
+ *xMax = line->edge[end];
+ *yMin = line->yMin;
+ *yMax = line->yMax;
+ break;
+ case 1:
+ *xMin = line->xMin;
+ *xMax = line->xMax;
+ *yMin = line->edge[start];
+ *yMax = line->edge[end];
+ break;
+ case 2:
+ *xMin = line->edge[end];
+ *xMax = line->edge[start];
+ *yMin = line->yMin;
+ *yMax = line->yMax;
+ break;
+ case 3:
+ *xMin = line->xMin;
+ *xMax = line->xMax;
+ *yMin = line->edge[end];
+ *yMax = line->edge[start];
+ break;
+ }
+}
+
bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax)
{
- return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, false, backward, wholeWord, xMin, yMin, xMax, yMax);
+ return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, false, false, backward, wholeWord, xMin, yMin, xMax, yMax, nullptr, nullptr);
}
bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax,
double *yMax)
+{
+ return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, ignoreDiacritics, false, backward, wholeWord, xMin, yMin, xMax, yMax, nullptr, nullptr);
+}
+
+bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool matchAcrossLines, bool backward, bool wholeWord, double *xMin,
+ double *yMin, double *xMax, double *yMax, PDFRectangle *continueMatch, bool *ignoredHyphen)
{
TextBlock *blk;
TextLine *line;
Unicode *s2, *txt, *reordered;
Unicode *p;
+ Unicode *nextline;
+ int nextline_len;
+ bool nextlineAfterHyphen = false;
int txtSize, m, i, j, k;
double xStart, yStart, xStop, yStop;
double xMin0, yMin0, xMax0, yMax0;
double xMin1, yMin1, xMax1, yMax1;
+ double xMin2, yMin2, xMax2, yMax2;
bool found;
if (len == 0) {
@@ -3824,6 +3864,11 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
return false;
}
+ if (matchAcrossLines && backward) {
+ // matchAcrossLines is unimplemented for backward search
+ matchAcrossLines = false;
+ }
+
// handle right-to-left text
reordered = (Unicode *)gmallocn(len, sizeof(Unicode));
reorderText(s, len, nullptr, primaryLR, nullptr, reordered);
@@ -3907,6 +3952,12 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
if (!line->normalized)
line->normalized = unicodeNormalizeNFKC(line->text, line->len, &line->normalized_len, &line->normalized_idx, true);
+
+ if (matchAcrossLines && line->next && !line->next->normalized)
+ line->next->normalized = unicodeNormalizeNFKC(line->next->text, line->next->len, &line->next->normalized_len, &line->next->normalized_idx, true);
+ nextline = nullptr;
+ nextline_len = 0;
+
// convert the line to uppercase
m = line->normalized_len;
@@ -3917,6 +3968,9 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
m = line->ascii_len;
else
ignoreDiacritics = false;
+
+ if (matchAcrossLines && line->next && !line->next->ascii_translation)
+ unicodeToAscii7(line->next->normalized, line->next->normalized_len, &line->next->ascii_translation, &line->next->ascii_len, line->next->normalized_idx, &line->next->ascii_idx);
}
if (!caseSensitive) {
if (m > txtSize) {
@@ -3929,65 +3983,111 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
else
txt[k] = unicodeToUpper(line->normalized[k]);
}
+ if (matchAcrossLines && line->next) {
+ nextline_len = ignoreDiacritics ? line->next->ascii_len : line->next->normalized_len;
+ nextline = (Unicode *)gmallocn(nextline_len, sizeof(Unicode));
+ for (k = 0; k < nextline_len; ++k) {
+ nextline[k] = ignoreDiacritics ? unicodeToUpper(line->next->ascii_translation[k]) : unicodeToUpper(line->next->normalized[k]);
+ }
+ }
} else {
if (ignoreDiacritics)
txt = line->ascii_translation;
else
txt = line->normalized;
+
+ if (matchAcrossLines && line->next) {
+ nextline_len = ignoreDiacritics ? line->next->ascii_len : line->next->normalized_len;
+ nextline = ignoreDiacritics ? line->next->ascii_translation : line->next->normalized;
+ }
}
// search each position in this line
j = backward ? m - len : 0;
p = txt + j;
- while (backward ? j >= 0 : j <= m - len) {
- if (!wholeWord || ((j == 0 || !unicodeTypeAlphaNum(txt[j - 1])) && (j + len == m || !unicodeTypeAlphaNum(txt[j + len])))) {
+ while (backward ? j >= 0 : j <= m - (nextline ? 1 : len)) {
+ bool wholeWordStartIsOk, wholeWordEndIsOk;
+ if (wholeWord) {
+ wholeWordStartIsOk = j == 0 || !unicodeTypeAlphaNum(txt[j - 1]);
+ if (nextline)
+ wholeWordEndIsOk = true; // word end may be in next line, so we'll check it later
+ else
+ wholeWordEndIsOk = j + len == m || !unicodeTypeAlphaNum(txt[j + len]);
+ }
+ if (!wholeWord || (wholeWordStartIsOk && wholeWordEndIsOk)) {
+ int n = 0;
+ bool spaceConsumedByNewline = false;
+ bool found_it;
// compare the strings
for (k = 0; k < len; ++k) {
- if (p[k] != s2[k]) {
+ bool last_char_of_line = j + k == m - 1;
+ bool last_char_of_search_term = k == len - 1;
+
+ if (p[k] != s2[k] || (nextline && last_char_of_line && !last_char_of_search_term)) {
+ // now check if the comparison failed at the end-of-line hyphen,
+ // and if so, keep on comparing at the next line
+ nextlineAfterHyphen = false;
+
+ if (s2[k] == p[k]) {
+ if (p[k] != (Unicode)'-' && !UnicodeIsWhitespace(s2[k + 1])) {
+ break;
+ }
+ k++;
+ } else if (p[k] != (Unicode)'-' || UnicodeIsWhitespace(s2[k]))
+ break;
+ else
+ nextlineAfterHyphen = true;
+
+ for (; n < nextline_len && k < len; ++k, ++n) {
+ if (nextline[n] != s2[k]) {
+ if (!spaceConsumedByNewline && !n && UnicodeIsWhitespace(s2[k])) {
+ n = -1;
+ spaceConsumedByNewline = true;
+ continue;
+ }
+ break;
+ }
+ }
break;
}
}
+ found_it = k == len;
+ if (found_it && nextline && wholeWord) { // check word end for nextline case
+ if (n) // Match ended at next line
+ wholeWordEndIsOk = n == nextline_len || !unicodeTypeAlphaNum(nextline[n]);
+ else // Match ended on same line
+ wholeWordEndIsOk = j + len == m || !unicodeTypeAlphaNum(txt[j + len]);
+
+ if (!wholeWordEndIsOk)
+ found_it = false;
+ }
// found it
- if (k == len) {
+ if (found_it) {
+ bool nextLineMatch = (bool)n;
+ if (spaceConsumedByNewline)
+ k--;
// where s2 matches a subsequence of a compatibility equivalence
// decomposition, highlight the entire glyph, since we don't know
// the internal layout of subglyph components
int normStart, normAfterEnd;
if (ignoreDiacritics) {
normStart = line->ascii_idx[j];
- normAfterEnd = line->ascii_idx[j + len - 1] + 1;
+ if (nextline)
+ normAfterEnd = line->ascii_idx[j + k - n];
+ else
+ normAfterEnd = line->ascii_idx[j + len - 1] + 1;
} else {
normStart = line->normalized_idx[j];
- normAfterEnd = line->normalized_idx[j + len - 1] + 1;
- }
- switch (line->rot) {
- case 0:
- xMin1 = line->edge[normStart];
- xMax1 = line->edge[normAfterEnd];
- yMin1 = line->yMin;
- yMax1 = line->yMax;
- break;
- case 1:
- xMin1 = line->xMin;
- xMax1 = line->xMax;
- yMin1 = line->edge[normStart];
- yMax1 = line->edge[normAfterEnd];
- break;
- case 2:
- xMin1 = line->edge[normAfterEnd];
- xMax1 = line->edge[normStart];
- yMin1 = line->yMin;
- yMax1 = line->yMax;
- break;
- case 3:
- xMin1 = line->xMin;
- xMax1 = line->xMax;
- yMin1 = line->edge[normAfterEnd];
- yMax1 = line->edge[normStart];
- break;
+ if (nextline)
+ normAfterEnd = line->normalized_idx[j + k - n];
+ else
+ normAfterEnd = line->normalized_idx[j + len - 1] + 1;
}
+
+ adjustRotation(line, normStart, normAfterEnd, &xMin1, &xMax1, &yMin1, &yMax1);
+
if (backward) {
if ((startAtTop || yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) && (stopAtBottom || yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) {
if (!found || yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) {
@@ -4006,6 +4106,18 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
yMin0 = yMin1;
yMax0 = yMax1;
found = true;
+ if (nextLineMatch) { // set the out parameters
+ if (ignoredHyphen)
+ *ignoredHyphen = nextlineAfterHyphen;
+
+ if (continueMatch) {
+ adjustRotation(line->next, 0, n, &xMin2, &xMax2, &yMin2, &yMax2);
+ continueMatch->x1 = xMin2;
+ continueMatch->y1 = yMax2;
+ continueMatch->x2 = xMax2;
+ continueMatch->y2 = yMin2;
+ }
+ }
}
}
}
@@ -4019,6 +4131,10 @@ bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtB
++p;
}
}
+
+ if (nextline && nextline != line->next->ascii_translation && nextline != line->next->normalized) {
+ gfree(nextline);
+ }
}
}
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index 2c39c67d..9e2d8275 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -613,6 +613,20 @@ public:
bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax,
double *yMax);
+ // Adds new parameter <matchAcrossLines>, which allows <s> to match on text
+ // spanning from end of a line to the next line. In that case, the rect for
+ // the part of match that falls on the next line will be stored in
+ // <continueMatch>, and if hyphenation (i.e. ignoring hyphen at end of line)
+ // was used while matching at the end of the line prior to <continueMatch>,
+ // then <ignoredHyphen> will be true, otherwise will be false.
+ // Only finding across two lines is supported, i.e. it won't match where <s>
+ // spans more than two lines.
+ //
+ // <matchAcrossLines> will be ignored if <backward> is true (as that
+ // combination has not been implemented yet).
+ bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool matchAcrossLines, bool backward, bool wholeWord, double *xMin, double *yMin,
+ double *xMax, double *yMax, PDFRectangle *continueMatch, bool *ignoredHyphen);
+
// Get the text which is inside the specified rectangle.
GooString *getText(double xMin, double yMin, double xMax, double yMax, EndOfLineKind textEOL) const;
@@ -656,6 +670,7 @@ private:
void clear();
void assignColumns(TextLineFrag *frags, int nFrags, bool rot) const;
int dumpFragment(const Unicode *text, int len, const UnicodeMap *uMap, GooString *s) const;
+ void adjustRotation(TextLine *line, int start, int end, double *xMin, double *xMax, double *yMin, double *yMax);
bool rawOrder; // keep text in content stream order
bool discardDiag; // discard diagonal text
diff --git a/qt5/src/poppler-page-private.h b/qt5/src/poppler-page-private.h
index e1312d44..6f1e668e 100644
--- a/qt5/src/poppler-page-private.h
+++ b/qt5/src/poppler-page-private.h
@@ -49,8 +49,8 @@ public:
static Link *convertLinkActionToLink(::LinkAction *a, DocumentData *parentDoc, const QRectF &linkArea);
TextPage *prepareTextSearch(const QString &text, Page::Rotation rotate, QVector<Unicode> *u);
- bool performSingleTextSearch(TextPage *textPage, QVector<Unicode> &u, double &sLeft, double &sTop, double &sRight, double &sBottom, Page::SearchDirection direction, bool sCase, bool sWords, bool sDiacritics);
- QList<QRectF> performMultipleTextSearch(TextPage *textPage, QVector<Unicode> &u, bool sCase, bool sWords, bool sDiacritics);
+ bool performSingleTextSearch(TextPage *textPage, QVector<Unicode> &u, double &sLeft, double &sTop, double &sRight, double &sBottom, Page::SearchDirection direction, bool sCase, bool sWords, bool sDiacritics, bool sAcrossLines);
+ QList<QRectF> performMultipleTextSearch(TextPage *textPage, QVector<Unicode> &u, bool sCase, bool sWords, bool sDiacritics, bool sAcrossLines);
};
}
diff --git a/qt5/src/poppler-page.cc b/qt5/src/poppler-page.cc
index c433ada4..bae438b7 100644
--- a/qt5/src/poppler-page.cc
+++ b/qt5/src/poppler-page.cc
@@ -50,6 +50,7 @@
#include <QtGui/QPainter>
#include <config.h>
+#include <cfloat>
#include <poppler-config.h>
#include <PDFDoc.h>
#include <Catalog.h>
@@ -359,24 +360,28 @@ inline TextPage *PageData::prepareTextSearch(const QString &text, Page::Rotation
return textPage;
}
-inline bool PageData::performSingleTextSearch(TextPage *textPage, QVector<Unicode> &u, double &sLeft, double &sTop, double &sRight, double &sBottom, Page::SearchDirection direction, bool sCase, bool sWords, bool sDiacritics = false)
+inline bool PageData::performSingleTextSearch(TextPage *textPage, QVector<Unicode> &u, double &sLeft, double &sTop, double &sRight, double &sBottom, Page::SearchDirection direction, bool sCase, bool sWords, bool sDiacritics,
+ bool sAcrossLines)
{
if (direction == Page::FromTop)
- return textPage->findText(u.data(), u.size(), true, true, false, false, sCase, sDiacritics, false, sWords, &sLeft, &sTop, &sRight, &sBottom);
+ return textPage->findText(u.data(), u.size(), true, true, false, false, sCase, sDiacritics, sAcrossLines, false, sWords, &sLeft, &sTop, &sRight, &sBottom, nullptr, nullptr);
else if (direction == Page::NextResult)
- return textPage->findText(u.data(), u.size(), false, true, true, false, sCase, sDiacritics, false, sWords, &sLeft, &sTop, &sRight, &sBottom);
+ return textPage->findText(u.data(), u.size(), false, true, true, false, sCase, sDiacritics, sAcrossLines, false, sWords, &sLeft, &sTop, &sRight, &sBottom, nullptr, nullptr);
else if (direction == Page::PreviousResult)
- return textPage->findText(u.data(), u.size(), false, true, true, false, sCase, sDiacritics, true, sWords, &sLeft, &sTop, &sRight, &sBottom);
+ return textPage->findText(u.data(), u.size(), false, true, true, false, sCase, sDiacritics, sAcrossLines, true, sWords, &sLeft, &sTop, &sRight, &sBottom, nullptr, nullptr);
return false;
}
-inline QList<QRectF> PageData::performMultipleTextSearch(TextPage *textPage, QVector<Unicode> &u, bool sCase, bool sWords, bool sDiacritics = false)
+inline QList<QRectF> PageData::performMultipleTextSearch(TextPage *textPage, QVector<Unicode> &u, bool sCase, bool sWords, bool sDiacritics, bool sAcrossLines)
{
QList<QRectF> results;
double sLeft = 0.0, sTop = 0.0, sRight = 0.0, sBottom = 0.0;
+ bool sIgnoredHyphen = false;
+ PDFRectangle continueMatch;
+ continueMatch.x1 = DBL_MAX; // we use this to detect valid return values
- while (textPage->findText(u.data(), u.size(), false, true, true, false, sCase, sDiacritics, false, sWords, &sLeft, &sTop, &sRight, &sBottom)) {
+ while (textPage->findText(u.data(), u.size(), false, true, true, false, sCase, sDiacritics, sAcrossLines, false, sWords, &sLeft, &sTop, &sRight, &sBottom, &continueMatch, &sIgnoredHyphen)) {
QRectF result;
result.setLeft(sLeft);
@@ -385,6 +390,18 @@ inline QList<QRectF> PageData::performMultipleTextSearch(TextPage *textPage, QVe
result.setBottom(sBottom);
results.append(result);
+
+ if (sAcrossLines && continueMatch.x1 != DBL_MAX) {
+ // TextPage::findText() stores yMax in continueMatch.y1 and yMin in continueMatch.y2,
+ // so the top edge is y2 and the bottom edge is y1 (same yMin/yMax order as sTop/sBottom).
+ QRectF resultN;
+ resultN.setLeft(continueMatch.x1);
+ resultN.setTop(continueMatch.y2);
+ resultN.setRight(continueMatch.x2);
+ resultN.setBottom(continueMatch.y1);
+ results.append(resultN);
+ continueMatch.x1 = DBL_MAX;
+ }
}
return results;
@@ -647,7 +664,7 @@ bool Page::search(const QString &text, double &sLeft, double &sTop, double &sRig
QVector<Unicode> u;
TextPage *textPage = m_page->prepareTextSearch(text, rotate, &u);
- const bool found = m_page->performSingleTextSearch(textPage, u, sLeft, sTop, sRight, sBottom, direction, sCase, false);
+ const bool found = m_page->performSingleTextSearch(textPage, u, sLeft, sTop, sRight, sBottom, direction, sCase, false, false, false);
textPage->decRefCnt();
@@ -659,11 +676,12 @@ bool Page::search(const QString &text, double &sLeft, double &sTop, double &sRig
const bool sCase = flags.testFlag(IgnoreCase) ? false : true;
const bool sWords = flags.testFlag(WholeWords) ? true : false;
const bool sDiacritics = flags.testFlag(IgnoreDiacritics) ? true : false;
+ const bool sAcrossLines = flags.testFlag(AcrossLines) ? true : false;
QVector<Unicode> u;
TextPage *textPage = m_page->prepareTextSearch(text, rotate, &u);
- const bool found = m_page->performSingleTextSearch(textPage, u, sLeft, sTop, sRight, sBottom, direction, sCase, sWords, sDiacritics);
+ const bool found = m_page->performSingleTextSearch(textPage, u, sLeft, sTop, sRight, sBottom, direction, sCase, sWords, sDiacritics, sAcrossLines);
textPage->decRefCnt();
@@ -677,7 +695,7 @@ QList<QRectF> Page::search(const QString &text, SearchMode caseSensitive, Rotati
QVector<Unicode> u;
TextPage *textPage = m_page->prepareTextSearch(text, rotate, &u);
- const QList<QRectF> results = m_page->performMultipleTextSearch(textPage, u, sCase, false);
+ const QList<QRectF> results = m_page->performMultipleTextSearch(textPage, u, sCase, false, false, false);
textPage->decRefCnt();
@@ -689,11 +707,12 @@ QList<QRectF> Page::search(const QString &text, SearchFlags flags, Rotation rota
const bool sCase = flags.testFlag(IgnoreCase) ? false : true;
const bool sWords = flags.testFlag(WholeWords) ? true : false;
const bool sDiacritics = flags.testFlag(IgnoreDiacritics) ? true : false;
+ const bool sAcrossLines = flags.testFlag(AcrossLines) ? true : false;
QVector<Unicode> u;
TextPage *textPage = m_page->prepareTextSearch(text, rotate, &u);
- const QList<QRectF> results = m_page->performMultipleTextSearch(textPage, u, sCase, sWords, sDiacritics);
+ const QList<QRectF> results = m_page->performMultipleTextSearch(textPage, u, sCase, sWords, sDiacritics, sAcrossLines);
textPage->decRefCnt();
diff --git a/qt5/src/poppler-qt5.h b/qt5/src/poppler-qt5.h
index 70d2566a..d2c55b3d 100644
--- a/qt5/src/poppler-qt5.h
+++ b/qt5/src/poppler-qt5.h
@@ -763,9 +763,12 @@ rather unexpected results.
NoSearchFlags = 0x00000000, ///< since 0.63
IgnoreCase = 0x00000001, ///< Case differences are ignored
WholeWords = 0x00000002, ///< Only whole words are matched
- IgnoreDiacritics = 0x00000004 ///< Diacritic differences (eg. accents, umlauts, diaeresis) are ignored. \since 0.73
- ///< This option will have no effect if the search term contains characters which
- ///< are not pure ascii.
+ IgnoreDiacritics = 0x00000004, ///< Diacritic differences (eg. accents, umlauts, diaeresis) are ignored. \since 0.73
+ ///< This option will have no effect if the search term contains characters which
+ ///< are not pure ascii.
+ AcrossLines = 0x00000008 ///< Allows matching text that spans from the end of one line to the start of the next.
+ ///< Text spanning more than two lines is not matched. A hyphen at the end of a line is
+ ///< ignored automatically, and whitespace in the search term may match a newline. \since 21.05.0
};
Q_DECLARE_FLAGS(SearchFlags, SearchFlag)
@@ -812,6 +815,9 @@ rather unexpected results.
/**
Returns a list of all occurrences of the specified text on the page.
+ If SearchFlags::AcrossLines is given in \p flags, a returned rect may cover only
+ part of a match when the matched text is split across two lines.
+
\param text the text to search
\param flags the flags to consider during matching
\param rotate the rotation to apply for the search order
diff --git a/qt5/tests/check_search.cpp b/qt5/tests/check_search.cpp
index 56cb53fc..7b379ad0 100644
--- a/qt5/tests/check_search.cpp
+++ b/qt5/tests/check_search.cpp
@@ -2,12 +2,15 @@
#include <poppler-qt5.h>
+// clazy:excludeall=qstring-allocations
+
class TestSearch : public QObject
{
Q_OBJECT
public:
TestSearch(QObject *parent = nullptr) : QObject(parent) { }
private slots:
+ void testAcrossLinesSearch(); // leave it first
void bug7063();
void testNextAndPrevious();
void testWholeWordsOnly();
@@ -33,12 +36,12 @@ void TestSearch::bug7063()
QCOMPARE(page->search(QStringLiteral(u"latin1:"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), false);
- QCOMPARE(page->search(QString::fromUtf8("é"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true); // clazy:exclude=qstring-allocations
- QCOMPARE(page->search(QString::fromUtf8("à"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true); // clazy:exclude=qstring-allocations
- QCOMPARE(page->search(QString::fromUtf8("ç"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true); // clazy:exclude=qstring-allocations
- QCOMPARE(page->search(QString::fromUtf8("search \"é\", \"à\" or \"ç\""), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true); // clazy:exclude=qstring-allocations
- QCOMPARE(page->search(QString::fromUtf8("¥µ©"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true); // clazy:exclude=qstring-allocations
- QCOMPARE(page->search(QString::fromUtf8("¥©"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), false); // clazy:exclude=qstring-allocations
+ QCOMPARE(page->search(QString::fromUtf8("é"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true);
+ QCOMPARE(page->search(QString::fromUtf8("à"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true);
+ QCOMPARE(page->search(QString::fromUtf8("ç"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true);
+ QCOMPARE(page->search(QString::fromUtf8("search \"é\", \"à\" or \"ç\""), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true);
+ QCOMPARE(page->search(QString::fromUtf8("¥µ©"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true);
+ QCOMPARE(page->search(QString::fromUtf8("¥©"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), false);
QCOMPARE(page->search(QStringLiteral(u"non-ascii:"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true);
@@ -47,12 +50,12 @@ void TestSearch::bug7063()
QCOMPARE(page->search(QStringLiteral(u"latin1:"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), false);
- QCOMPARE(page->search(QString::fromUtf8("é"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true); // clazy:exclude=qstring-allocations
- QCOMPARE(page->search(QString::fromUtf8("à"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true); // clazy:exclude=qstring-allocations
- QCOMPARE(page->search(QString::fromUtf8("ç"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true); // clazy:exclude=qstring-allocations
- QCOMPARE(page->search(QString::fromUtf8("search \"é\", \"à\" or \"ç\""), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true); // clazy:exclude=qstring-allocations
- QCOMPARE(page->search(QString::fromUtf8("¥µ©"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true); // clazy:exclude=qstring-allocations
- QCOMPARE(page->search(QString::fromUtf8("¥©"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), false); // clazy:exclude=qstring-allocations
+ QCOMPARE(page->search(QString::fromUtf8("é"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true);
+ QCOMPARE(page->search(QString::fromUtf8("à"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true);
+ QCOMPARE(page->search(QString::fromUtf8("ç"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true);
+ QCOMPARE(page->search(QString::fromUtf8("search \"é\", \"à\" or \"ç\""), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true);
+ QCOMPARE(page->search(QString::fromUtf8("¥µ©"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), true);
+ QCOMPARE(page->search(QString::fromUtf8("¥©"), rectLeft, rectTop, rectRight, rectBottom, Poppler::Page::FromTop), false);
}
void TestSearch::testNextAndPrevious()
@@ -201,17 +204,17 @@ void TestSearch::testIgnoreDiacritics()
QCOMPARE(page->search(QStringLiteral("ciguena"), left, top, right, bottom, direction, mode0), false);
QCOMPARE(page->search(QStringLiteral("Ciguena"), left, top, right, bottom, direction, mode1), false);
QCOMPARE(page->search(QStringLiteral("ciguena"), left, top, right, bottom, direction, mode1), true);
- QCOMPARE(page->search(QString::fromUtf8("cigüeña"), left, top, right, bottom, direction, mode1), true); // clazy:exclude=qstring-allocations
- QCOMPARE(page->search(QString::fromUtf8("cigüena"), left, top, right, bottom, direction, mode1), false); // clazy:exclude=qstring-allocations
- QCOMPARE(page->search(QString::fromUtf8("Cigüeña"), left, top, right, bottom, direction, mode1), false); // clazy:exclude=qstring-allocations
+ QCOMPARE(page->search(QString::fromUtf8("cigüeña"), left, top, right, bottom, direction, mode1), true);
+ QCOMPARE(page->search(QString::fromUtf8("cigüena"), left, top, right, bottom, direction, mode1), false);
+ QCOMPARE(page->search(QString::fromUtf8("Cigüeña"), left, top, right, bottom, direction, mode1), false);
QCOMPARE(page->search(QStringLiteral("Ciguena"), left, top, right, bottom, direction, mode2), true);
QCOMPARE(page->search(QStringLiteral("ciguena"), left, top, right, bottom, direction, mode2), true);
QCOMPARE(page->search(QStringLiteral("Ciguena"), left, top, right, bottom, direction, mode3), true);
QCOMPARE(page->search(QStringLiteral("ciguena"), left, top, right, bottom, direction, mode3), true);
- QCOMPARE(page->search(QString::fromUtf8("cigüeña"), left, top, right, bottom, direction, mode4), true); // clazy:exclude=qstring-allocations
- QCOMPARE(page->search(QString::fromUtf8("Cigüeña"), left, top, right, bottom, direction, mode4), true); // clazy:exclude=qstring-allocations
- QCOMPARE(page->search(QString::fromUtf8("cigüena"), left, top, right, bottom, direction, mode4), false); // clazy:exclude=qstring-allocations
+ QCOMPARE(page->search(QString::fromUtf8("cigüeña"), left, top, right, bottom, direction, mode4), true);
+ QCOMPARE(page->search(QString::fromUtf8("Cigüeña"), left, top, right, bottom, direction, mode4), true);
+ QCOMPARE(page->search(QString::fromUtf8("cigüena"), left, top, right, bottom, direction, mode4), false);
QCOMPARE(page->search(QStringLiteral("Ciguena"), left, top, right, bottom, direction, mode4), false);
QCOMPARE(page->search(QStringLiteral("kopfe"), left, top, right, bottom, direction, mode2), true);
@@ -253,7 +256,7 @@ void TestSearch::testRussianSearch()
double l, t, r, b; // left, top, right, bottom
// In the searched page 5, these two words do exist: простой and Простой
- const QString str = QString::fromUtf8("простой"); // clazy:exclude=qstring-allocations
+ const QString str = QString::fromUtf8("простой");
QCOMPARE(page->search(str, l, t, r, b, direction, mode0), true);
QCOMPARE(page->search(str, l, t, r, b, direction, mode1), true);
QCOMPARE(page->search(str, l, t, r, b, direction, mode2), true);
@@ -272,12 +275,95 @@ void TestSearch::testDeseretSearch()
double l, t, r, b; // left, top, right, bottom
- const QString str = QString::fromUtf8("𐐐𐐯𐑊𐐬"); // clazy:exclude=qstring-allocations
+ const QString str = QString::fromUtf8("𐐐𐐯𐑊𐐬");
QCOMPARE(page->search(str, l, t, r, b, Poppler::Page::FromTop, Poppler::Page::NoSearchFlags), true);
- const QString str2 = QString::fromUtf8("𐐸𐐯𐑊𐐬"); // clazy:exclude=qstring-allocations
+ const QString str2 = QString::fromUtf8("𐐸𐐯𐑊𐐬");
QCOMPARE(page->search(str2, l, t, r, b, Poppler::Page::FromTop, Poppler::Page::IgnoreCase), true);
}
+void TestSearch::testAcrossLinesSearch()
+{
+ // Test for searching across lines with new flag Poppler::Page::AcrossLines
+ // and its automatic features like ignoring hyphen at end of line or allowing
+ // whitespace in the search term to match on newline character.
+ QScopedPointer<Poppler::Document> document(Poppler::Document::load(TESTDATADIR "/unittestcases/searchAcrossLines.pdf"));
+ QVERIFY(document);
+
+ QScopedPointer<Poppler::Page> page(document->page(1));
+ QVERIFY(page);
+
+ const Poppler::Page::SearchDirection direction = Poppler::Page::FromTop;
+
+ const Poppler::Page::SearchFlags empty = Poppler::Page::NoSearchFlags;
+ const Poppler::Page::SearchFlags mode0 = Poppler::Page::AcrossLines;
+ const Poppler::Page::SearchFlags mode1 = Poppler::Page::AcrossLines | Poppler::Page::IgnoreDiacritics;
+ const Poppler::Page::SearchFlags mode2 = Poppler::Page::AcrossLines | Poppler::Page::IgnoreDiacritics | Poppler::Page::IgnoreCase;
+ const Poppler::Page::SearchFlags mode2W = mode2 | Poppler::Page::WholeWords;
+
+ double l, t, r, b; // left, top, right, bottom
+
+ // In the searched page, "re-conocimiento", "PRUE-BA" and "imáge-nes" are each split across lines
+ const QString str1 = QString::fromUtf8("reconocimiento");
+ const QString str2 = QString::fromUtf8("IMagenes");
+ // Test it cannot be found with empty search flags
+ QCOMPARE(page->search(str1, l, t, r, b, direction, empty), false);
+ // Test it is found with AcrossLines option
+ QCOMPARE(page->search(str1, l, t, r, b, direction, mode0), true);
+ // Test AcrossLines with IgnoreDiacritics and IgnoreCase options
+ QCOMPARE(page->search(str2, l, t, r, b, direction, mode0), false);
+ QCOMPARE(page->search(str2, l, t, r, b, direction, mode1), false);
+ QCOMPARE(page->search(str2, l, t, r, b, direction, mode2), true);
+ // Test with WholeWords too
+ QCOMPARE(page->search(str2, l, t, r, b, direction, mode2W), true);
+
+ // Now test that AcrossLines also allows whitespace in the search term to match on newline char.
+ // In the searched page, "podrá" ends a line and "acordar" starts the next line, so we
+ // now test we match it with "podrá acordar"
+ const QString str3 = QString::fromUtf8("podrá acordar,");
+ QCOMPARE(page->search(str3, l, t, r, b, direction, mode0), true);
+ QCOMPARE(page->search(str3, l, t, r, b, direction, mode1), true);
+ QCOMPARE(page->search(str3, l, t, r, b, direction, mode2), true);
+ QCOMPARE(page->search(str3, l, t, r, b, direction, mode2W), true);
+ // now test it also works with IgnoreDiacritics and IgnoreCase
+ const QString str4 = QString::fromUtf8("PODRA acordar");
+ QCOMPARE(page->search(str4, l, t, r, b, direction, mode0), false);
+ QCOMPARE(page->search(str4, l, t, r, b, direction, mode1), false);
+ QCOMPARE(page->search(str4, l, t, r, b, direction, mode2), true);
+ QCOMPARE(page->search(str4, l, t, r, b, direction, mode2W), false); // false as it lacks ending comma
+
+ // Now test that when a hyphen char in the search term matches a hyphen at end of line,
+ // then we don't automatically ignore it, but treat it as a normal char.
+ // In the searched page, "CC BY-NC-SA 4.0" is split across two lines on the second hyphen
+ const QString str5 = QString::fromUtf8("CC BY-NC-SA 4.0");
+ QScopedPointer<Poppler::Page> page0(document->page(0));
+ QVERIFY(page0);
+ QCOMPARE(page0->search(str5, l, t, r, b, direction, mode0), true);
+ QCOMPARE(page0->search(str5, l, t, r, b, direction, mode1), true);
+ QCOMPARE(page0->search(str5, l, t, r, b, direction, mode2), true);
+ QCOMPARE(page0->search(str5, l, t, r, b, direction, mode2W), true);
+ QCOMPARE(page0->search(QString::fromUtf8("NC-SA"), l, t, r, b, direction, mode2W), false);
+ // Searching for "CC BY-NCSA 4.0" should also match, because hyphen is now ignored at end of line
+ const QString str6 = QString::fromUtf8("CC BY-NCSA 4.0");
+ QCOMPARE(page0->search(str6, l, t, r, b, direction, mode0), true);
+ QCOMPARE(page0->search(str6, l, t, r, b, direction, mode1), true);
+ QCOMPARE(page0->search(str6, l, t, r, b, direction, mode2), true);
+ QCOMPARE(page0->search(str6, l, t, r, b, direction, mode2W), true);
+
+ // Now for completeness, we will match the full text of two lines
+ const QString full2lines = QString::fromUtf8("Las pruebas se practicarán en vista pública, si bien, excepcionalmente, el Tribunal podrá acordar, mediante providencia, que determinadas pruebas se celebren fuera del acto de juicio");
+ QCOMPARE(page->search(full2lines, l, t, r, b, direction, mode0), true);
+ QCOMPARE(page->search(full2lines, l, t, r, b, direction, mode1), true);
+ QCOMPARE(page->search(full2lines, l, t, r, b, direction, mode2), true);
+ QCOMPARE(page->search(full2lines, l, t, r, b, direction, mode2W), true);
+ // And now the full text of two lines split by a hyphenated word
+ const QString full2linesHyphenated = QString::fromUtf8("Consiste básicamente en información digitalizada, codificados y alojados en un elemento contenedor digital (equipos, dispositivos periféricos, unidades de memoria, unidades "
+ "virtualizadas, tramas");
+ QCOMPARE(page->search(full2linesHyphenated, l, t, r, b, direction, mode0), true);
+ QCOMPARE(page->search(full2linesHyphenated, l, t, r, b, direction, mode1), true);
+ QCOMPARE(page->search(full2linesHyphenated, l, t, r, b, direction, mode2), true);
+ QCOMPARE(page->search(full2linesHyphenated, l, t, r, b, direction, mode2W), true);
+}
+
QTEST_GUILESS_MAIN(TestSearch)
#include "check_search.moc"
diff --git a/qt6/src/poppler-page-private.h b/qt6/src/poppler-page-private.h
index e1312d44..6f1e668e 100644
--- a/qt6/src/poppler-page-private.h
+++ b/qt6/src/poppler-page-private.h
@@ -49,8 +49,8 @@ public:
static Link *convertLinkActionToLink(::LinkAction *a, DocumentData *parentDoc, const QRectF &linkArea);
TextPage *prepareTextSearch(const QString &text, Page::Rotation rotate, QVector<Unicode> *u);
- bool performSingleTextSearch(TextPage *textPage, QVector<Unicode> &u, double &sLeft, double &sTop, double &sRight, double &sBottom, Page::SearchDirection direction, bool sCase, bool sWords, bool sDiacritics);
- QList<QRectF> performMultipleTextSearch(TextPage *textPage, QVector<Unicode> &u, bool sCase, bool sWords, bool sDiacritics);
+ bool performSingleTextSearch(TextPage *textPage, QVector<Unicode> &u, double &sLeft, double &sTop, double &sRight, double &sBottom, Page::SearchDirection direction, bool sCase, bool sWords, bool sDiacritics, bool sAcrossLines);
+ QList<QRectF> performMultipleTextSearch(TextPage *textPage, QVector<Unicode> &u, bool sCase, bool sWords, bool sDiacritics, bool sAcrossLines);
};
}
diff --git a/qt6/src/poppler-page.cc b/qt6/src/poppler-page.cc
index f1f4116c..c561377d 100644
--- a/qt6/src/poppler-page.cc
+++ b/qt6/src/poppler-page.cc
@@ -50,6 +50,7 @@
#include <QtGui/QPainter>
#include <config.h>
+#include <cfloat>
#include <poppler-config.h>
#include <PDFDoc.h>
#include <Catalog.h>
@@ -359,24 +360,28 @@ inline TextPage *PageData::prepareTextSearch(const QString &text, Page::Rotation
return textPage;
}
-inline bool PageData::performSingleTextSearch(TextPage *textPage, QVector<Unicode> &u, double &sLeft, double &sTop, double &sRight, double &sBottom, Page::SearchDirection direction, bool sCase, bool sWords, bool sDiacritics = false)
+inline bool PageData::performSingleTextSearch(TextPage *textPage, QVector<Unicode> &u, double &sLeft, double &sTop, double &sRight, double &sBottom, Page::SearchDirection direction, bool sCase, bool sWords, bool sDiacritics,
+ bool sAcrossLines)
{
if (direction == Page::FromTop)
- return textPage->findText(u.data(), u.size(), true, true, false, false, sCase, sDiacritics, false, sWords, &sLeft, &sTop, &sRight, &sBottom);
+ return textPage->findText(u.data(), u.size(), true, true, false, false, sCase, sDiacritics, sAcrossLines, false, sWords, &sLeft, &sTop, &sRight, &sBottom, nullptr, nullptr);
else if (direction == Page::NextResult)
- return textPage->findText(u.data(), u.size(), false, true, true, false, sCase, sDiacritics, false, sWords, &sLeft, &sTop, &sRight, &sBottom);
+ return textPage->findText(u.data(), u.size(), false, true, true, false, sCase, sDiacritics, sAcrossLines, false, sWords, &sLeft, &sTop, &sRight, &sBottom, nullptr, nullptr);
else if (direction == Page::PreviousResult)
- return textPage->findText(u.data(), u.size(), false, true, true, false, sCase, sDiacritics, true, sWords, &sLeft, &sTop, &sRight, &sBottom);
+ return textPage->findText(u.data(), u.size(), false, true, true, false, sCase, sDiacritics, sAcrossLines, true, sWords, &sLeft, &sTop, &sRight, &sBottom, nullptr, nullptr);
return false;
}
-inline QList<QRectF> PageData::performMultipleTextSearch(TextPage *textPage, QVector<Unicode> &u, bool sCase, bool sWords, bool sDiacritics = false)
+inline QList<QRectF> PageData::performMultipleTextSearch(TextPage *textPage, QVector<Unicode> &u, bool sCase, bool sWords, bool sDiacritics, bool sAcrossLines)
{
QList<QRectF> results;
double sLeft = 0.0, sTop = 0.0, sRight = 0.0, sBottom = 0.0;
+ bool sIgnoredHyphen = false;
+ PDFRectangle continueMatch;
+ continueMatch.x1 = DBL_MAX; // we use this to detect valid return values
- while (textPage->findText(u.data(), u.size(), false, true, true, false, sCase, sDiacritics, false, sWords, &sLeft, &sTop, &sRight, &sBottom)) {
+ while (textPage->findText(u.data(), u.size(), false, true, true, false, sCase, sDiacritics, sAcrossLines, false, sWords, &sLeft, &sTop, &sRight, &sBottom, &continueMatch, &sIgnoredHyphen)) {
QRectF result;
result.setLeft(sLeft);
@@ -385,6 +390,18 @@ inline QList<QRectF> PageData::performMultipleTextSearch(TextPage *textPage, QVe
result.setBottom(sBottom);
results.append(result);
+
+ if (sAcrossLines && continueMatch.x1 != DBL_MAX) {
+ // TextPage::findText() stores yMax in continueMatch.y1 and yMin in continueMatch.y2,
+ // so the top edge is y2 and the bottom edge is y1 (same yMin/yMax order as sTop/sBottom).
+ QRectF resultN;
+ resultN.setLeft(continueMatch.x1);
+ resultN.setTop(continueMatch.y2);
+ resultN.setRight(continueMatch.x2);
+ resultN.setBottom(continueMatch.y1);
+ results.append(resultN);
+ continueMatch.x1 = DBL_MAX;
+ }
}
return results;
@@ -645,11 +662,12 @@ bool Page::search(const QString &text, double &sLeft, double &sTop, double &sRig
const bool sCase = flags.testFlag(IgnoreCase) ? false : true;
const bool sWords = flags.testFlag(WholeWords) ? true : false;
const bool sDiacritics = flags.testFlag(IgnoreDiacritics) ? true : false;
+ const bool sAcrossLines = flags.testFlag(AcrossLines) ? true : false;
QVector<Unicode> u;
TextPage *textPage = m_page->prepareTextSearch(text, rotate, &u);
- const bool found = m_page->performSingleTextSearch(textPage, u, sLeft, sTop, sRight, sBottom, direction, sCase, sWords, sDiacritics);
+ const bool found = m_page->performSingleTextSearch(textPage, u, sLeft, sTop, sRight, sBottom, direction, sCase, sWords, sDiacritics, sAcrossLines);
textPage->decRefCnt();
@@ -661,11 +679,12 @@ QList<QRectF> Page::search(const QString &text, SearchFlags flags, Rotation rota
const bool sCase = flags.testFlag(IgnoreCase) ? false : true;
const bool sWords = flags.testFlag(WholeWords) ? true : false;
const bool sDiacritics = flags.testFlag(IgnoreDiacritics) ? true : false;
+ const bool sAcrossLines = flags.testFlag(AcrossLines) ? true : false;
QVector<Unicode> u;
TextPage *textPage = m_page->prepareTextSearch(text, rotate, &u);
- const QList<QRectF> results = m_page->performMultipleTextSearch(textPage, u, sCase, sWords, sDiacritics);
+ const QList<QRectF> results = m_page->performMultipleTextSearch(textPage, u, sCase, sWords, sDiacritics, sAcrossLines);
textPage->decRefCnt();
diff --git a/qt6/src/poppler-qt6.h b/qt6/src/poppler-qt6.h
index 5d38fbdd..c069e393 100644
--- a/qt6/src/poppler-qt6.h
+++ b/qt6/src/poppler-qt6.h
@@ -730,9 +730,12 @@ rather unexpected results.
NoSearchFlags = 0x00000000,
IgnoreCase = 0x00000001, ///< Case differences are ignored
WholeWords = 0x00000002, ///< Only whole words are matched
- IgnoreDiacritics = 0x00000004 ///< Diacritic differences (eg. accents, umlauts, diaeresis) are ignored.
- ///< This option will have no effect if the search term contains characters which
- ///< are not pure ascii.
+ IgnoreDiacritics = 0x00000004, ///< Diacritic differences (eg. accents, umlauts, diaeresis) are ignored.
+ ///< This option will have no effect if the search term contains characters which
+ ///< are not pure ascii.
+ AcrossLines = 0x00000008 ///< Allows matching text that spans from the end of one line to the start of the next.
+ ///< Text spanning more than two lines is not matched. A hyphen at the end of a line is
+ ///< ignored automatically, and whitespace in the search term may match a newline. \since 21.05.0
};
Q_DECLARE_FLAGS(SearchFlags, SearchFlag)
@@ -751,6 +754,9 @@ rather unexpected results.
/**
Returns a list of all occurrences of the specified text on the page.
+ If SearchFlags::AcrossLines is given in \p flags, a returned rect may cover only
+ part of a match when the matched text is split across two lines.
+
\param text the text to search
\param flags the flags to consider during matching
\param rotate the rotation to apply for the search order
diff --git a/qt6/tests/check_search.cpp b/qt6/tests/check_search.cpp
index c2ba3993..d8534975 100644
--- a/qt6/tests/check_search.cpp
+++ b/qt6/tests/check_search.cpp
@@ -8,6 +8,7 @@ class TestSearch : public QObject
public:
TestSearch(QObject *parent = nullptr) : QObject(parent) { }
private slots:
+ void testAcrossLinesSearch(); // leave it first
void bug7063();
void testNextAndPrevious();
void testWholeWordsOnly();
@@ -279,5 +280,89 @@ void TestSearch::testDeseretSearch()
QCOMPARE(page->search(str2, l, t, r, b, Poppler::Page::FromTop, Poppler::Page::IgnoreCase), true);
}
+void TestSearch::testAcrossLinesSearch()
+{
+ // Test for searching across lines with new flag Poppler::Page::AcrossLines
+ // and its automatic features like ignoring hyphen at end of line or allowing
+ // whitespace in the search term to match on newline character.
+ QScopedPointer<Poppler::Document> document(Poppler::Document::load(TESTDATADIR "/unittestcases/searchAcrossLines.pdf"));
+ QVERIFY(document);
+
+ QScopedPointer<Poppler::Page> page(document->page(1));
+ QVERIFY(page);
+
+ const Poppler::Page::SearchDirection direction = Poppler::Page::FromTop;
+
+ const Poppler::Page::SearchFlags empty = Poppler::Page::NoSearchFlags;
+ const Poppler::Page::SearchFlags mode0 = Poppler::Page::AcrossLines;
+ const Poppler::Page::SearchFlags mode1 = Poppler::Page::AcrossLines | Poppler::Page::IgnoreDiacritics;
+ const Poppler::Page::SearchFlags mode2 = Poppler::Page::AcrossLines | Poppler::Page::IgnoreDiacritics | Poppler::Page::IgnoreCase;
+ const Poppler::Page::SearchFlags mode2W = mode2 | Poppler::Page::WholeWords;
+
+ double l, t, r, b; // left, top, right, bottom
+
+ // In the searched page, each of "re-conocimiento" "PRUE-BA" "imáge-nes" is split across lines
+ const QString str1 = QString::fromUtf8("reconocimiento"); // clazy:exclude=qstring-allocations
+ const QString str2 = QString::fromUtf8("IMagenes"); // clazy:exclude=qstring-allocations
+ // Test it cannot be found with empty search flags
+ QCOMPARE(page->search(str1, l, t, r, b, direction, empty), false);
+ // Test it is found with AcrossLines option
+ QCOMPARE(page->search(str1, l, t, r, b, direction, mode0), true);
+ // Test AcrossLines with IgnoreDiacritics and IgnoreCase options
+ QCOMPARE(page->search(str2, l, t, r, b, direction, mode0), false);
+ QCOMPARE(page->search(str2, l, t, r, b, direction, mode1), false);
+ QCOMPARE(page->search(str2, l, t, r, b, direction, mode2), true);
+ // Test with WholeWords too
+ QCOMPARE(page->search(str2, l, t, r, b, direction, mode2W), true);
+
+ // Now test that AcrossLines also allows whitespace in the search term to match on newline char.
+ // In the searched page, "podrá" ends a line and "acordar" starts the next line, so we
+ // now test we match it with "podrá acordar," (including the trailing comma)
+ const QString str3 = QString::fromUtf8("podrá acordar,"); // clazy:exclude=qstring-allocations
+ QCOMPARE(page->search(str3, l, t, r, b, direction, mode0), true);
+ QCOMPARE(page->search(str3, l, t, r, b, direction, mode1), true);
+ QCOMPARE(page->search(str3, l, t, r, b, direction, mode2), true);
+ QCOMPARE(page->search(str3, l, t, r, b, direction, mode2W), true);
+ // now test it also works with IgnoreDiacritics and IgnoreCase
+ const QString str4 = QString::fromUtf8("PODRA acordar"); // clazy:exclude=qstring-allocations
+ QCOMPARE(page->search(str4, l, t, r, b, direction, mode0), false);
+ QCOMPARE(page->search(str4, l, t, r, b, direction, mode1), false);
+ QCOMPARE(page->search(str4, l, t, r, b, direction, mode2), true);
+ QCOMPARE(page->search(str4, l, t, r, b, direction, mode2W), false); // false as it lacks ending comma
+
+ // Now test that when a hyphen char in the search term matches a hyphen at end of line,
+ // then we don't automatically ignore it, but treat it as a normal char.
+ // In the searched page, "CC BY-NC-SA 4.0" is split across two lines on the second hyphen
+ const QString str5 = QString::fromUtf8("CC BY-NC-SA 4.0"); // clazy:exclude=qstring-allocations
+ QScopedPointer<Poppler::Page> page0(document->page(0));
+ QVERIFY(page0);
+ QCOMPARE(page0->search(str5, l, t, r, b, direction, mode0), true);
+ QCOMPARE(page0->search(str5, l, t, r, b, direction, mode1), true);
+ QCOMPARE(page0->search(str5, l, t, r, b, direction, mode2), true);
+ QCOMPARE(page0->search(str5, l, t, r, b, direction, mode2W), true);
+ QCOMPARE(page0->search(QString::fromUtf8("NC-SA"), l, t, r, b, direction, mode2W), false); // clazy:exclude=qstring-allocations
+ // Searching for "CC BY-NCSA 4.0" should also match, because hyphen is now ignored at end of line
+ const QString str6 = QString::fromUtf8("CC BY-NCSA 4.0"); // clazy:exclude=qstring-allocations
+ QCOMPARE(page0->search(str6, l, t, r, b, direction, mode0), true);
+ QCOMPARE(page0->search(str6, l, t, r, b, direction, mode1), true);
+ QCOMPARE(page0->search(str6, l, t, r, b, direction, mode2), true);
+ QCOMPARE(page0->search(str6, l, t, r, b, direction, mode2W), true);
+
+ // Now for completeness, we will match the full text of two lines
+ const QString full2lines = QString::fromUtf8(
+ "Las pruebas se practicarán en vista pública, si bien, excepcionalmente, el Tribunal podrá acordar, mediante providencia, que determinadas pruebas se celebren fuera del acto de juicio"); // clazy:exclude=qstring-allocations
+ QCOMPARE(page->search(full2lines, l, t, r, b, direction, mode0), true);
+ QCOMPARE(page->search(full2lines, l, t, r, b, direction, mode1), true);
+ QCOMPARE(page->search(full2lines, l, t, r, b, direction, mode2), true);
+ QCOMPARE(page->search(full2lines, l, t, r, b, direction, mode2W), true);
+ // And now the full text of two lines split by a hyphenated word
+ const QString full2linesHyphenated = QString::fromUtf8("Consiste básicamente en información digitalizada, codificados y alojados en un elemento contenedor digital (equipos, dispositivos periféricos, unidades de memoria, unidades "
+ "virtualizadas, tramas"); // clazy:exclude=qstring-allocations
+ QCOMPARE(page->search(full2linesHyphenated, l, t, r, b, direction, mode0), true);
+ QCOMPARE(page->search(full2linesHyphenated, l, t, r, b, direction, mode1), true);
+ QCOMPARE(page->search(full2linesHyphenated, l, t, r, b, direction, mode2), true);
+ QCOMPARE(page->search(full2linesHyphenated, l, t, r, b, direction, mode2W), true);
+}
+
QTEST_GUILESS_MAIN(TestSearch)
#include "check_search.moc"
More information about the poppler
mailing list