[poppler] poppler/TextOutputDev.cc poppler/TextOutputDev.h
Albert Astals Cid
aacid at kemper.freedesktop.org
Sat Apr 4 09:41:38 PDT 2015
poppler/TextOutputDev.cc | 419 ++++++++++++++++++++++++++++++++++-------------
poppler/TextOutputDev.h | 31 +++
2 files changed, 336 insertions(+), 114 deletions(-)
New commits:
commit edc7bb4fe7b5e718431de12bee1d95036d06efa0
Author: Jason Crain <jason at aquaticape.us>
Date: Sat Apr 4 18:40:48 2015 +0200
Combine base characters and diacritical marks
LaTeX adds base characters and diacritical marks as separate
characters. When a base character and diacritical mark are drawn over
each other, this patch converts them into a combining character
sequence.
Bug #87215
C# poppler-0.31.0.tar.xz
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 150d444..16f099f 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -30,7 +30,7 @@
// Copyright (C) 2010 Suzuki Toshiya <mpsuzuki at hiroshima-u.ac.jp>
// Copyright (C) 2011 Sam Liao <phyomh at gmail.com>
// Copyright (C) 2012 Horst Prote <prote at fmi.uni-stuttgart.de>
-// Copyright (C) 2012, 2013, 2014 Jason Crain <jason at aquaticape.us>
+// Copyright (C) 2012, 2013-2015 Jason Crain <jason at aquaticape.us>
// Copyright (C) 2012 Peter Breitenlohner <peb at mppmu.mpg.de>
// Copyright (C) 2013 José Aliste <jaliste at src.gnome.org>
// Copyright (C) 2013 Thomas Freitag <Thomas.Freitag at alfa.de>
@@ -173,6 +173,11 @@
// Max distance between edge of text and edge of link border
#define hyperlinkSlack 2
+// Max distance between characters when combining a base character and
+// combining character
+#define combMaxMidDelta 0.3
+#define combMaxBaseDelta 0.4
+
//------------------------------------------------------------------------
// TextUnderline
//------------------------------------------------------------------------
@@ -236,6 +241,18 @@ GBool TextFontInfo::matches(TextFontInfo *fontInfo) {
return gfxFont == fontInfo->gfxFont;
}
+double TextFontInfo::getAscent() {
+ return gfxFont ? gfxFont->getAscent() : 0.95;
+}
+
+double TextFontInfo::getDescent() {
+ return gfxFont ? gfxFont->getDescent() : -0.35;
+}
+
+int TextFontInfo::getWMode() {
+ return gfxFont ? gfxFont->getWMode() : 0;
+}
+
//------------------------------------------------------------------------
// TextWord
//------------------------------------------------------------------------
@@ -282,19 +299,7 @@ TextWord::~TextWord() {
void TextWord::addChar(GfxState *state, TextFontInfo *fontA, double x, double y,
double dx, double dy, int charPosA, int charLen,
CharCode c, Unicode u, Matrix textMatA) {
- GfxFont *gfxFont;
- double ascent, descent;
- ascent = descent = 0; // make gcc happy
-
- if (len == size) {
- size += 16;
- text = (Unicode *)greallocn(text, size, sizeof(Unicode));
- charcode = (Unicode *)greallocn(charcode, size, sizeof(CharCode));
- edge = (double *)greallocn(edge, (size + 1), sizeof(double));
- charPos = (int *)greallocn(charPos, size + 1, sizeof(int));
- font = (TextFontInfo **)greallocn(font, size, sizeof(TextFontInfo *));
- textMat = (Matrix *)greallocn(textMat, size, sizeof(Matrix));
- }
+ ensureCapacity(len+1);
text[len] = u;
charcode[len] = c;
charPos[len] = charPosA;
@@ -302,61 +307,26 @@ void TextWord::addChar(GfxState *state, TextFontInfo *fontA, double x, double y,
font[len] = fontA;
textMat[len] = textMatA;
- if (len == 0) {
- if ((gfxFont = fontA->gfxFont)) {
- ascent = gfxFont->getAscent() * fontSize;
- descent = gfxFont->getDescent() * fontSize;
- wMode = gfxFont->getWMode();
- } else {
- // this means that the PDF file draws text without a current font,
- // which should never happen
- ascent = 0.95 * fontSize;
- descent = -0.35 * fontSize;
- wMode = 0;
- }
- }
+ if (len == 0)
+ setInitialBounds(fontA, x, y);
if (wMode) { // vertical writing mode
// NB: the rotation value has been incremented by 1 (in
// TextPage::beginWord()) for vertical writing mode
switch (rot) {
case 0:
- if (len == 0) {
- xMin = x - fontSize;
- yMin = y - fontSize;
- yMax = y;
- base = y;
- }
edge[len] = x - fontSize;
xMax = edge[len+1] = x;
break;
case 1:
- if (len == 0) {
- xMin = x;
- yMin = y - fontSize;
- xMax = x + fontSize;
- base = x;
- }
edge[len] = y - fontSize;
yMax = edge[len+1] = y;
break;
case 2:
- if (len == 0) {
- yMin = y;
- xMax = x + fontSize;
- yMax = y + fontSize;
- base = y;
- }
edge[len] = x + fontSize;
xMin = edge[len+1] = x;
break;
case 3:
- if (len == 0) {
- xMin = x - fontSize;
- xMax = x;
- yMax = y + fontSize;
- base = x;
- }
edge[len] = y + fontSize;
yMin = edge[len+1] = y;
break;
@@ -364,66 +334,18 @@ void TextWord::addChar(GfxState *state, TextFontInfo *fontA, double x, double y,
} else { // horizontal writing mode
switch (rot) {
case 0:
- if (len == 0) {
- xMin = x;
- yMin = y - ascent;
- yMax = y - descent;
- if (yMin == yMax) {
- // this is a sanity check for a case that shouldn't happen -- but
- // if it does happen, we want to avoid dividing by zero later
- yMin = y;
- yMax = y + 1;
- }
- base = y;
- }
edge[len] = x;
xMax = edge[len+1] = x + dx;
break;
case 1:
- if (len == 0) {
- xMin = x + descent;
- yMin = y;
- xMax = x + ascent;
- if (xMin == xMax) {
- // this is a sanity check for a case that shouldn't happen -- but
- // if it does happen, we want to avoid dividing by zero later
- xMin = x;
- xMax = x + 1;
- }
- base = x;
- }
edge[len] = y;
yMax = edge[len+1] = y + dy;
break;
case 2:
- if (len == 0) {
- yMin = y + descent;
- xMax = x;
- yMax = y + ascent;
- if (yMin == yMax) {
- // this is a sanity check for a case that shouldn't happen -- but
- // if it does happen, we want to avoid dividing by zero later
- yMin = y;
- yMax = y + 1;
- }
- base = y;
- }
edge[len] = x;
xMin = edge[len+1] = x + dx;
break;
case 3:
- if (len == 0) {
- xMin = x - ascent;
- xMax = x - descent;
- yMax = y;
- if (xMin == xMax) {
- // this is a sanity check for a case that shouldn't happen -- but
- // if it does happen, we want to avoid dividing by zero later
- xMin = x;
- xMax = x + 1;
- }
- base = x;
- }
edge[len] = y;
yMin = edge[len+1] = y + dy;
break;
@@ -432,6 +354,269 @@ void TextWord::addChar(GfxState *state, TextFontInfo *fontA, double x, double y,
++len;
}
+void TextWord::setInitialBounds(TextFontInfo *fontA, double x, double y) {
+ double ascent = fontA->getAscent() * fontSize;
+ double descent = fontA->getDescent() * fontSize;
+ wMode = fontA->getWMode();
+
+ if (wMode) { // vertical writing mode
+ // NB: the rotation value has been incremented by 1 (in
+ // TextPage::beginWord()) for vertical writing mode
+ switch (rot) {
+ case 0:
+ xMin = x - fontSize;
+ yMin = y - fontSize;
+ yMax = y;
+ base = y;
+ break;
+ case 1:
+ xMin = x;
+ yMin = y - fontSize;
+ xMax = x + fontSize;
+ base = x;
+ break;
+ case 2:
+ yMin = y;
+ xMax = x + fontSize;
+ yMax = y + fontSize;
+ base = y;
+ break;
+ case 3:
+ xMin = x - fontSize;
+ xMax = x;
+ yMax = y + fontSize;
+ base = x;
+ break;
+ }
+ } else { // horizontal writing mode
+ switch (rot) {
+ case 0:
+ xMin = x;
+ yMin = y - ascent;
+ yMax = y - descent;
+ if (yMin == yMax) {
+ // this is a sanity check for a case that shouldn't happen -- but
+ // if it does happen, we want to avoid dividing by zero later
+ yMin = y;
+ yMax = y + 1;
+ }
+ base = y;
+ break;
+ case 1:
+ xMin = x + descent;
+ yMin = y;
+ xMax = x + ascent;
+ if (xMin == xMax) {
+ // this is a sanity check for a case that shouldn't happen -- but
+ // if it does happen, we want to avoid dividing by zero later
+ xMin = x;
+ xMax = x + 1;
+ }
+ base = x;
+ break;
+ case 2:
+ yMin = y + descent;
+ xMax = x;
+ yMax = y + ascent;
+ if (yMin == yMax) {
+ // this is a sanity check for a case that shouldn't happen -- but
+ // if it does happen, we want to avoid dividing by zero later
+ yMin = y;
+ yMax = y + 1;
+ }
+ base = y;
+ break;
+ case 3:
+ xMin = x - ascent;
+ xMax = x - descent;
+ yMax = y;
+ if (xMin == xMax) {
+ // this is a sanity check for a case that shouldn't happen -- but
+ // if it does happen, we want to avoid dividing by zero later
+ xMin = x;
+ xMax = x + 1;
+ }
+ base = x;
+ break;
+ }
+ }
+}
+
+void TextWord::ensureCapacity(int capacity) {
+ if (capacity > size) {
+ size = std::max(size + 16, capacity);
+ text = (Unicode *)greallocn(text, size, sizeof(Unicode));
+ charcode = (CharCode *)greallocn(charcode, (size + 1), sizeof(CharCode));
+ edge = (double *)greallocn(edge, (size + 1), sizeof(double));
+ charPos = (int *)greallocn(charPos, size + 1, sizeof(int));
+ font = (TextFontInfo **)greallocn(font, size, sizeof(TextFontInfo *));
+ textMat = (Matrix *)greallocn(textMat, size, sizeof(Matrix));
+ }
+}
+
+struct CombiningTable {
+ Unicode base;
+ Unicode comb;
+};
+
+static struct CombiningTable combiningTable[] = {
+ {0x0060, 0x0300}, // grave
+ {0x00a8, 0x0308}, // dieresis
+ {0x00af, 0x0304}, // macron
+ {0x00b4, 0x0301}, // acute
+ {0x00b8, 0x0327}, // cedilla
+ {0x02c6, 0x0302}, // circumflex
+ {0x02c7, 0x030c}, // caron
+ {0x02d8, 0x0306}, // breve
+ {0x02d9, 0x0307}, // dotaccent
+ {0x02da, 0x030a}, // ring
+ {0x02dc, 0x0303}, // tilde
+ {0x02dd, 0x030b} // hungarumlaut (double acute accent)
+};
+
+// Returns the combining version of character u, or 0 if it has none
+Unicode getCombiningChar(Unicode u) {
+ int len = sizeof(combiningTable) / sizeof(combiningTable[0]);
+ for (int i = 0; i < len; ++i) {
+ if (u == combiningTable[i].base)
+ return combiningTable[i].comb;
+ }
+ return 0;
+}
+
+GBool TextWord::addCombining(GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y,
+ double dx, double dy, int charPosA, int charLen,
+ CharCode c, Unicode u, Matrix textMatA) {
+ if (len == 0 || wMode != 0 || fontA->getWMode() != 0)
+ return gFalse;
+
+ Unicode cCurrent = getCombiningChar(u);
+ Unicode cPrev = getCombiningChar(text[len-1]);
+ double edgeMid = (edge[len-1] + edge[len]) / 2;
+ double charMid, maxScaledMidDelta, charBase, maxScaledBaseDelta;
+
+ if (cCurrent != 0 && unicodeTypeAlphaNum(text[len-1])) {
+ // Current is a combining character, previous is base character
+ maxScaledMidDelta = fabs(edge[len] - edge[len-1]) * combMaxMidDelta;
+ charMid = charBase = maxScaledBaseDelta = 0;
+
+ // Test if characters overlap
+ if (rot == 0 || rot == 2) {
+ charMid = x + (dx / 2);
+ charBase = y;
+ maxScaledBaseDelta = (yMax - yMin) * combMaxBaseDelta;
+ } else {
+ charMid = y + (dy / 2);
+ charBase = x;
+ maxScaledBaseDelta = (xMax - xMin) * combMaxBaseDelta;
+ }
+
+ if (fabs(charMid - edgeMid) >= maxScaledMidDelta ||
+ fabs(charBase - base) >= maxScaledBaseDelta)
+ return gFalse;
+
+ // Add character, but don't adjust edge / bounding box because
+ // combining character's positioning could be odd.
+ ensureCapacity(len+1);
+ text[len] = cCurrent;
+ charcode[len] = c;
+ charPos[len] = charPosA;
+ charPos[len+1] = charPosA + charLen;
+ font[len] = fontA;
+ textMat[len] = textMatA;
+ edge[len+1] = edge[len];
+ edge[len] = (edge[len+1] + edge[len-1]) / 2;
+ ++len;
+ return gTrue;
+ }
+
+ if (cPrev != 0 && unicodeTypeAlphaNum(u)) {
+ // Previous is a combining character, current is base character
+ maxScaledBaseDelta = (fontA->getAscent() - fontA->getDescent()) * fontSizeA * combMaxBaseDelta;
+ charMid = charBase = maxScaledMidDelta = 0;
+
+ // Test if characters overlap
+ if (rot == 0 || rot == 2) {
+ charMid = x + (dx / 2);
+ charBase = y;
+ maxScaledMidDelta = fabs(dx * combMaxMidDelta);
+ } else {
+ charMid = y + (dy / 2);
+ charBase = x;
+ maxScaledMidDelta = fabs(dy * combMaxMidDelta);
+ }
+
+ if (fabs(charMid - edgeMid) >= maxScaledMidDelta ||
+ fabs(charBase - base) >= maxScaledBaseDelta)
+ return gFalse;
+
+ // move combining character to after base character
+ ensureCapacity(len+1);
+ fontSize = fontSizeA;
+ text[len] = cPrev;
+ charcode[len] = charcode[len-1];
+ charPos[len] = charPosA;
+ charPos[len+1] = charPosA + charLen;
+ font[len] = font[len-1];
+ textMat[len] = textMat[len-1];
+
+ text[len-1] = u;
+ charcode[len-1] = c;
+ font[len-1] = fontA;
+ textMat[len-1] = textMatA;
+
+ if (len == 1)
+ setInitialBounds(fontA, x, y);
+
+ // Updated edges / bounding box because we changed the base
+ // character.
+ if (wMode) {
+ switch (rot) {
+ case 0:
+ edge[len-1] = x - fontSize;
+ xMax = edge[len+1] = x;
+ break;
+ case 1:
+ edge[len-1] = y - fontSize;
+ yMax = edge[len+1] = y;
+ break;
+ case 2:
+ edge[len-1] = x + fontSize;
+ xMin = edge[len+1] = x;
+ break;
+ case 3:
+ edge[len-1] = y + fontSize;
+ yMin = edge[len+1] = y;
+ break;
+ }
+ } else {
+ switch (rot) {
+ case 0:
+ edge[len-1] = x;
+ xMax = edge[len+1] = x + dx;
+ break;
+ case 1:
+ edge[len-1] = y;
+ yMax = edge[len+1] = y + dy;
+ break;
+ case 2:
+ edge[len-1] = x;
+ xMin = edge[len+1] = x + dx;
+ break;
+ case 3:
+ edge[len-1] = y;
+ yMin = edge[len+1] = y + dy;
+ break;
+ }
+ }
+
+ edge[len] = (edge[len+1] + edge[len-1]) / 2;
+ ++len;
+ return gTrue;
+ }
+ return gFalse;
+}
+
void TextWord::merge(TextWord *word) {
int i;
@@ -447,15 +632,7 @@ void TextWord::merge(TextWord *word) {
if (word->yMax > yMax) {
yMax = word->yMax;
}
- if (len + word->len > size) {
- size = len + word->len;
- text = (Unicode *)greallocn(text, size, sizeof(Unicode));
- charcode = (CharCode *)greallocn(charcode, (size + 1), sizeof(CharCode));
- edge = (double *)greallocn(edge, (size + 1), sizeof(double));
- charPos = (int *)greallocn(charPos, size + 1, sizeof(int));
- font = (TextFontInfo **)greallocn(font, size, sizeof(TextFontInfo *));
- textMat = (Matrix *)greallocn(textMat, size, sizeof(Matrix));
- }
+ ensureCapacity(len + word->len);
for (i = 0; i < word->len; ++i) {
text[len + i] = word->text[i];
charcode[len + i] = word->charcode[i];
@@ -2070,6 +2247,7 @@ TextPage::TextPage(GBool rawOrderA) {
haveLastFind = gFalse;
underlines = new GooList();
links = new GooList();
+ mergeCombining = gTrue;
}
TextPage::~TextPage() {
@@ -2320,6 +2498,17 @@ void TextPage::addChar(GfxState *state, double x, double y,
return;
}
+ state->getFontTransMat(&mat.m[0], &mat.m[1], &mat.m[2], &mat.m[3]);
+ mat.m[4] = x1;
+ mat.m[5] = y1;
+
+ if (mergeCombining && curWord && uLen == 1 &&
+ curWord->addCombining(state, curFont, curFontSize, x1, y1, w1, h1, charPos, nBytes, c,
+ u[0], mat)) {
+ charPos += nBytes;
+ return;
+ }
+
// start a new word if:
// (1) this character doesn't fall in the right place relative to
// the end of the previous word (this places upper and lower
@@ -2356,7 +2545,7 @@ void TextPage::addChar(GfxState *state, double x, double y,
}
overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize &&
fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
- wMode = curFont->gfxFont ? curFont->gfxFont->getWMode() : 0;
+ wMode = curFont->getWMode();
if (overlap || lastCharOverlap ||
sp < -minDupBreakOverlap * curWord->fontSize ||
sp > minWordBreakSpace * curWord->fontSize ||
@@ -2376,10 +2565,6 @@ void TextPage::addChar(GfxState *state, double x, double y,
beginWord(state);
}
- state->getFontTransMat(&mat.m[0], &mat.m[1], &mat.m[2], &mat.m[3]);
- mat.m[4] = x1;
- mat.m[5] = y1;
-
// page rotation and/or transform matrices can cause text to be
// drawn in reverse order -- in this case, swap the begin/end
// coordinates and break text into individual chars
@@ -5043,6 +5228,10 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
uMap->decRefCnt();
}
+void TextPage::setMergeCombining(GBool merge) {
+ mergeCombining = merge;
+}
+
void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) {
TextLineFrag *frag0, *frag1;
int rot, col1, col2, i, j, k;
@@ -5610,6 +5799,10 @@ GBool TextOutputDev::findCharRange(int pos, int length,
return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
}
+void TextOutputDev::setMergeCombining(GBool merge) {
+ text->setMergeCombining(merge);
+}
+
#if TEXTOUT_WORD_LIST
TextWordList *TextOutputDev::makeWordList() {
return text->makeWordList(physLayout);
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index 23fb3b7..a0aa6f8 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -19,7 +19,7 @@
// Copyright (C) 2007 Adrian Johnson <ajohnson at redneon.com>
// Copyright (C) 2008, 2010 Albert Astals Cid <aacid at kde.org>
// Copyright (C) 2010 Brian Ewins <brian.ewins at gmail.com>
-// Copyright (C) 2012, 2013 Jason Crain <jason at aquaticape.us>
+// Copyright (C) 2012, 2013, 2015 Jason Crain <jason at aquaticape.us>
// Copyright (C) 2013 Thomas Freitag <Thomas.Freitag at alfa.de>
//
// To see a description of the changes please see the Changelog file that
@@ -82,6 +82,15 @@ public:
GBool matches(GfxState *state);
GBool matches(TextFontInfo *fontInfo);
+ // Get the font ascent, or a default value if the font is not set
+ double getAscent();
+
+ // Get the font descent, or a default value if the font is not set
+ double getDescent();
+
+ // Get the writing mode (0 or 1), or 0 if the font is not set
+ int getWMode();
+
#if TEXTOUT_WORD_LIST
// Get the font name (which may be NULL).
GooString *getFontName() { return fontName; }
@@ -125,6 +134,14 @@ public:
double dx, double dy, int charPosA, int charLen,
CharCode c, Unicode u, Matrix textMatA);
+ // Attempt to add a character to the word as a combining character.
+ // Either character u or the last character in the word must be an
+ // acute, dieresis, or other combining character. Returns true if
+ // the character was added.
+ GBool addCombining(GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y,
+ double dx, double dy, int charPosA, int charLen,
+ CharCode c, Unicode u, Matrix textMatA);
+
// Merge <word> onto the end of <this>.
void merge(TextWord *word);
@@ -172,6 +189,8 @@ public:
GBool hasSpaceAfter () { return spaceAfter; }
TextWord* nextWord () { return next; };
private:
+ void ensureCapacity(int capacity);
+ void setInitialBounds(TextFontInfo *fontA, double x, double y);
int rot; // rotation, multiple of 90 degrees
// (0, 1, 2, or 3)
@@ -601,6 +620,10 @@ public:
// Get the head of the linked list of TextFlows.
TextFlow *getFlows() { return flows; }
+ // If true, will combine characters when a base and combining
+ // character are drawn on each other.
+ void setMergeCombining(GBool merge);
+
#if TEXTOUT_WORD_LIST
// Build a flat word list, in content stream order (if
// this->rawOrder is true), physical layout order (if <physLayout>
@@ -619,6 +642,8 @@ private:
int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GooString *s);
GBool rawOrder; // keep text in content stream order
+ GBool mergeCombining; // merge when combining and base characters
+ // are drawn on top of each other
double pageWidth, pageHeight; // width and height of current page
TextWord *curWord; // currently active string
@@ -814,6 +839,10 @@ public:
GooString *getSelectionText(PDFRectangle *selection,
SelectionStyle style);
+ // If true, will combine characters when a base and combining
+ // character are drawn on each other.
+ void setMergeCombining(GBool merge);
+
#if TEXTOUT_WORD_LIST
// Build a flat word list, in content stream order (if
// this->rawOrder is true), physical layout order (if
More information about the poppler
mailing list