[poppler] [PATCH] Fixup LaTeX composed characters
Tim Brody
tdb2 at ecs.soton.ac.uk
Fri Mar 25 07:54:59 PDT 2011
Patch attached, this time without the off-by-one error in the malloc (I also
added some more comments).
On Fri, 25 Mar 2011 13:28:05 +0000, Tim Brody <tdb2 at ecs.soton.ac.uk> wrote:
> Hi All,
>
> Attached is a patch to address the previous problem I wrote about with
> pdflatex-produced PDFs that contain overlapping-diacritics/accents.
>
> This patch contains:
> - a table of diacritic to Unicode combining character code-points
> - if an overlapping character is detected, checks whether the first (in
> stream-sequence) character is in the table
> - pops the diacritic off the word
> - appends the diacritic to the character as a Unicode combining
> character
>
> This does not fix \b{o} or \d{o} because TeX places them on the next line
> (so they aren't detected as overlapping).
>
> Yes, this is an issue with pdflatex, but there are 100,000s of
> TeX-produced
> PDFs for which we don't have the source ...
--
All the best,
Tim.
-------------- next part --------------
From d3e7c2fcf620910980cc3529657cc3b88c8cf2e5 Mon Sep 17 00:00:00 2001
From: Tim Brody <tdb2 at ecs.soton.ac.uk>
Date: Fri, 25 Mar 2011 13:02:18 +0000
Subject: [PATCH] Turn TeX-style composed characters into Unicode combining characters during text conversion.
---
poppler/TextOutputDev.cc | 38 ++++++++++++++++++++++----
poppler/UnicodeCompEquivTables.h | 54 ++++++++++++++++++++++++++++++++++++++
poppler/UnicodeTypeTable.cc | 23 ++++++++++++++++
poppler/UnicodeTypeTable.h | 2 +
4 files changed, 111 insertions(+), 6 deletions(-)
create mode 100644 poppler/UnicodeCompEquivTables.h
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 13c67c6..2191fd1 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -2161,6 +2161,7 @@ void TextPage::addChar(GfxState *state, double x, double y,
double x1, y1, w1, h1, dx2, dy2, base, sp, delta;
GBool overlap;
int i;
+ Unicode *uc = NULL; // u + combining character
// subtract char and word spacing from the dx,dy values
sp = state->getCharSpace();
@@ -2236,12 +2237,34 @@ void TextPage::addChar(GfxState *state, double x, double y,
}
overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize &&
fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
- if (overlap || lastCharOverlap ||
- sp < -minDupBreakOverlap * curWord->fontSize ||
- sp > minWordBreakSpace * curWord->fontSize ||
- fabs(base - curWord->base) > 0.5 ||
- curFontSize != curWord->fontSize) {
- endWord();
+ if (
+ // place overlapping characters in their own word
+ lastCharOverlap ||
+ // whitespace along main axis
+ sp > minWordBreakSpace * curWord->fontSize ||
+ // whitespace along secondary-axis
+ fabs(base - curWord->base) > 0.5 ||
+ // font size changed
+ curFontSize != curWord->fontSize
+ ) {
+ endWord();
+ }
+ // overlapping characters
+ else if (overlap || sp < -minDupBreakOverlap * curWord->fontSize ) {
+ // "u => ü, as seen in pdflatex output
+ Unicode uu;
+ if (unicodeCombineEquiv (curWord->text[curWord->len - 1], &uu)) {
+ curWord->len--;
+ curWord->charLen--;
+ uc = (Unicode *) gmallocn (uLen+1, sizeof (Unicode));
+ memcpy (uc, u, uLen * sizeof (Unicode));
+ uc[uLen++] = uu;
+ u = uc;
+ overlap = gFalse;
+ }
+ else {
+ endWord();
+ }
}
lastCharOverlap = overlap;
} else {
@@ -2293,6 +2316,9 @@ void TextPage::addChar(GfxState *state, double x, double y,
}
}
}
+ if (uc) {
+ gfree (uc);
+ }
if (curWord) {
curWord->charLen += nBytes;
}
diff --git a/poppler/UnicodeCompEquivTables.h b/poppler/UnicodeCompEquivTables.h
new file mode 100644
index 0000000..7b28ea1
--- /dev/null
+++ b/poppler/UnicodeCompEquivTables.h
@@ -0,0 +1,54 @@
+// Generated by combining.pl at Thu Mar 24 11:44:21 2011
+
+typedef struct {
+ Unicode character; // lookup key: the code point unicodeCombineEquiv() is asked about
+ Unicode combining; // value handed back through unicodeCombineEquiv()'s out parameter
+} combine_equiv;
+
+#define COMBINE_EQUIV_TABLE_LENGTH 43 // must equal the number of entries in combine_equiv_table
+
+static const combine_equiv combine_equiv_table[] = { // keep sorted by ascending character: the lookup bisects
+ { 0x0022, 0x030e }, // NOTE(review): addChar's comment describes U+0022 + u as u-diaeresis (U+0308), not U+030E -- confirm
+ { 0x0027, 0x0301 },
+ { 0x005e, 0x0302 },
+ { 0x005f, 0x0332 },
+ { 0x0060, 0x0300 },
+ { 0x007e, 0x0303 },
+ { 0x00a8, 0x0308 },
+ { 0x00af, 0x0305 },
+ { 0x00b0, 0x030a },
+ { 0x00b4, 0x0301 },
+ { 0x00b8, 0x0327 },
+ { 0x02b1, 0x0324 },
+ { 0x02b2, 0x0321 },
+ { 0x02b7, 0x032b },
+ { 0x02b9, 0x0301 },
+ { 0x02ba, 0x030b },
+ { 0x02bb, 0x0312 },
+ { 0x02bc, 0x0315 },
+ { 0x02bd, 0x0314 },
+ { 0x02c0, 0x0309 },
+ { 0x02c6, 0x0302 },
+ { 0x02c7, 0x030c },
+ { 0x02c8, 0x030d },
+ { 0x02c9, 0x0304 },
+ { 0x02ca, 0x0301 },
+ { 0x02cb, 0x0300 },
+ { 0x02cc, 0x0329 },
+ { 0x02cd, 0x0331 },
+ { 0x02d4, 0x0323 },
+ { 0x02d5, 0x031e },
+ { 0x02d6, 0x031f },
+ { 0x02d7, 0x0320 },
+ { 0x02d8, 0x0306 },
+ { 0x02d9, 0x0307 },
+ { 0x02da, 0x030a },
+ { 0x02db, 0x0328 },
+ { 0x02dc, 0x0303 },
+ { 0x02dd, 0x030b },
+ { 0x0384, 0x0301 },
+ { 0x0559, 0x0314 },
+ { 0x055a, 0x0313 },
+ { 0x0901, 0x0310 },
+ { 0x2017, 0x0333 },
+};
diff --git a/poppler/UnicodeTypeTable.cc b/poppler/UnicodeTypeTable.cc
index c0483a5..ab40f86 100644
--- a/poppler/UnicodeTypeTable.cc
+++ b/poppler/UnicodeTypeTable.cc
@@ -22,6 +22,7 @@
#include <stdlib.h>
#include "CharTypes.h"
#include "UnicodeTypeTable.h"
+#include "UnicodeCompEquivTables.h"
#include "goo/gmem.h"
struct UnicodeMapTableEntry {
@@ -1095,6 +1096,28 @@ static GBool combine(Unicode base, Unicode add, Unicode *out) {
(((v) - HANGUL_V_BASE) + (HANGUL_V_COUNT * ((l) - HANGUL_L_BASE)))))
#define HANGUL_COMPOSE_LV_T(lv, t) ((lv) + ((t) - HANGUL_T_BASE))
+// Binary-searches combine_equiv_table for @in. On a match, stores the combining
+// code point in @out and returns gTrue; otherwise returns gFalse, @out untouched.
+GBool unicodeCombineEquiv(Unicode in, Unicode *out) {
+ int start = 0, end = COMBINE_EQUIV_TABLE_LENGTH; // remaining candidates lie within indices [start, end)
+
+ while (gTrue) {
+ int midpoint = (start+end) / 2;
+ if (combine_equiv_table[midpoint].character == in) {
+ *out = combine_equiv_table[midpoint].combining;
+ return gTrue;
+ }
+ else if (start == midpoint) // window was down to one entry and it did not match
+ break;
+ else if (in > combine_equiv_table[midpoint].character) // relies on the table's ascending key order
+ start = midpoint;
+ else
+ end = midpoint;
+ }
+
+ return gFalse;
+}
+
// Converts Unicode string @in of length @len to its normalization in form
// NFKC (compatibility decomposition + canonical composition). The length of
// the resulting Unicode string is returned in @out_len. If non-NULL, @indices
diff --git a/poppler/UnicodeTypeTable.h b/poppler/UnicodeTypeTable.h
index 939e916..cabe80e 100644
--- a/poppler/UnicodeTypeTable.h
+++ b/poppler/UnicodeTypeTable.h
@@ -28,6 +28,8 @@ extern GBool unicodeTypeR(Unicode c);
extern Unicode unicodeToUpper(Unicode c);
+extern GBool unicodeCombineEquiv(Unicode in, Unicode *out); // gTrue and *out set when in has a combining equivalent, else gFalse
+
extern Unicode *unicodeNormalizeNFKC(Unicode *in, int len,
int *out_len, int **offsets);
--
1.7.2.3
More information about the poppler
mailing list