[poppler] 2 commits - utils/HtmlOutputDev.cc

Mon Nov 1 10:17:55 UTC 2021

utils/HtmlOutputDev.cc |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

New commits:
commit 28a523d6485d3be3c2a606cc942c34536cd26b50
Author: Christopher Hasse <hasse.christopher at gmail.com>
Date:   Mon Sep 13 01:21:20 2021 -0500

    Update pdftohtml duplicate detection
    
    The delta values used now are the same as the ones used in
    pdftotext, which have proven to be much more reliable.
    Additionally, the search range along the x-axis for duplicate strings
    has been increased, which seems to vastly improve the ability to find
    duplicates. This algorithm can now properly detect duplicates as shown
    in #321.

diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index 1c2f26c3..9e832bc8 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -516,11 +516,11 @@ void HtmlPage::coalesce()
         bool found;
         while (str1) {
             double size = str1->yMax - str1->yMin;
-            double xLimit = str1->xMin + size * 0.175;
+            double xLimit = str1->xMin + size;
             found = false;
             for (str2 = str1, str3 = str1->yxNext; str3 && str3->xMin < xLimit; str2 = str3, str3 = str2->yxNext) {
                 if (str3->len == str1->len && !memcmp(str3->text, str1->text, str1->len * sizeof(Unicode)) && fabs(str3->yMin - str1->yMin) < size * 0.2 && fabs(str3->yMax - str1->yMax) < size * 0.2
-                    && fabs(str3->xMax - str1->xMax) < size * 0.175) {
+                    && fabs(str3->xMax - str1->xMax) < size * 0.1) {
                     found = true;
                     // printf("found duplicate!\n");
                     break;
commit 94448a433c8690cb782ca9783d22e411e8d80e8d
Author: Christopher Hasse <hasse.christopher at gmail.com>
Date:   Sun Sep 12 03:53:08 2021 -0500

    pdftohtml: Reduce sensitivity of duplicate detection
    
    fixes #1117
    
    In some fonts, strings such as "ll" or "ff" are placed close enough
    together to trigger duplicate detection in pdftohtml. This commit makes
    the detection algorithm less sensitive to reduce the false positives
    while still maintaining the original function of the code.
    
    Prior to this commit, if a character's `xMax` was less than 20% of its
    height away from the following character's `xMax`, it was treated as a
    duplicate and removed. This commit changes that value to 17.5%, which
    will reduce the number of false positives without introducing too many
    false negatives.

diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index d49ccf9e..1c2f26c3 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -516,11 +516,11 @@ void HtmlPage::coalesce()
         bool found;
         while (str1) {
             double size = str1->yMax - str1->yMin;
-            double xLimit = str1->xMin + size * 0.2;
+            double xLimit = str1->xMin + size * 0.175;
             found = false;
             for (str2 = str1, str3 = str1->yxNext; str3 && str3->xMin < xLimit; str2 = str3, str3 = str2->yxNext) {
                 if (str3->len == str1->len && !memcmp(str3->text, str1->text, str1->len * sizeof(Unicode)) && fabs(str3->yMin - str1->yMin) < size * 0.2 && fabs(str3->yMax - str1->yMax) < size * 0.2
-                    && fabs(str3->xMax - str1->xMax) < size * 0.2) {
+                    && fabs(str3->xMax - str1->xMax) < size * 0.175) {
                     found = true;
                     // printf("found duplicate!\n");
                     break;