[poppler] pdftohtml patch: new reflow option (fixed)
Warren Toomey
poppler at tuhs.org
Tue Sep 30 23:49:35 PDT 2008
This patch adds the -reflow option to pdftohtml, which produces nicer
HTML output, and replaces the broken patch that I submitted as
http://lists.freedesktop.org/archives/poppler/2008-September/004115.html
The patch has no significant effect on the -complex and -xml modes, and -nomerge
still works. The patch also fixes this bugzilla bug for poppler:
https://bugs.freedesktop.org/show_bug.cgi?id=12522
Thanks,
Warren
-------------- next part --------------
--- HtmlOutputDev.cc Wed Oct 1 15:38:03 2008
+++ HtmlOutputDev.cc Wed Oct 1 15:38:14 2008
@@ -53,6 +53,7 @@
extern GBool xml;
extern GBool showHidden;
extern GBool noMerge;
+extern GBool reFlow;
static GooString* basename(GooString* str){
@@ -381,10 +382,13 @@
void HtmlPage::coalesce() {
HtmlString *str1, *str2;
HtmlFont *hfont1, *hfont2;
- double space, horSpace, vertSpace, vertOverlap;
- GBool addSpace, addLineBreak;
+ double space, horSpace, vertSpace;
+ GBool addSpace;
+ GBool nextLine; // is str2 on the next line below?
+ GBool addNewline; // should we output a newline?
int n, i;
double curX, curY;
+ double lineStartX=0.0; // x-value of last line start
#if 0 //~ for debugging
for (str1 = yxStrings; str1; str1 = str1->yxNext) {
@@ -452,51 +456,65 @@
str1->htext->insert(0, ls);
delete ls;
}
- curX = str1->xMin; curY = str1->yMin;
+ lineStartX= curX = str1->xMin; curY = str1->yMin;
while (str1 && (str2 = str1->yxNext)) {
hfont2 = getFont(str2);
space = str1->yMax - str1->yMin;
horSpace = str2->xMin - str1->xMax;
- addLineBreak = !noMerge && (fabs(str1->xMin - str2->xMin) < 0.4);
+ // Determine if str2 is on the line below the current line
+ addNewline=nextLine = !noMerge && (fabs(str2->yMin - str1->yMin) > 5.0);
vertSpace = str2->yMin - str1->yMax;
+ // Heuristic: if the last character in str1 is a hyphen,
+ // turn off addNewline. This will "glue" hyphenated words
+ // that have been split over multiple lines.
+ if (reFlow && str1->text[str1->len -1] == '-') {
+ addNewline=0;
+ // Also remove the hyphen
+ str1->len--;
+ str1->htext->del(str1->htext->getLength() - 1, 1);
+ }
+
//printf("coalesce %d %d %f? ", str1->dir, str2->dir, d);
- if (str2->yMin >= str1->yMin && str2->yMin <= str1->yMax)
- {
- vertOverlap = str1->yMax - str2->yMin;
- } else
- if (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax)
- {
- vertOverlap = str2->yMax - str1->yMin;
- } else
+ // Is str2 a new paragraph?
+ if (nextLine && (
+ // Is this an indented new line?
+ (str2->xMin > lineStartX + 3.0)
+ // Or is there a blank line between this and the last line?
+ || (vertSpace > 0.5 * space)
+ // Or it is XML output, so we always separate each line
+ || xml ))
{
- vertOverlap = 0;
- }
-
- if (
- (
- (
- (
- (rawOrder && vertOverlap > 0.5 * space)
- ||
- (!rawOrder && str2->yMin < str1->yMax)
- ) &&
- (horSpace > -0.5 * space && horSpace < space)
- ) ||
- (vertSpace >= 0 && vertSpace < 0.5 * space && addLineBreak)
- ) &&
- (!complexMode || (hfont1->isEqualIgnoreBold(*hfont2))) && // in complex mode fonts must be the same, in other modes fonts do not metter
- str1->dir == str2->dir // text direction the same
- )
- {
-// printf("yes\n");
+ // A new paragraph, so keep strings separate
+// printf("new paragraph\n");
+ GBool finish_a = str1->getLink() != NULL;
+ GBool finish_bold = hfont1->isBold();
+ GBool finish_italic = hfont1->isItalic();
+ CloseTags( str1->htext, finish_a, finish_italic, finish_bold );
+
+ str1->xMin = curX; str1->yMin = curY;
+ str1 = str2;
+ curX = str1->xMin; curY = str1->yMin;
+ lineStartX= str1->xMin;
+ hfont1 = hfont2;
+ if( hfont1->isBold() )
+ str1->htext->insert(0,"<b>",3);
+ if( hfont1->isItalic() )
+ str1->htext->insert(0,"<i>",3);
+ if( str1->getLink() != NULL ) {
+ GooString *ls = str1->getLink()->getLinkStart();
+ str1->htext->insert(0, ls);
+ delete ls;
+ }
+ } else {
+// printf("same paragraph\n");
n = str1->len + str2->len;
if ((addSpace = horSpace > 0.1 * space)) {
++n;
}
- if (addLineBreak) {
+ if (nextLine) {
++n;
}
@@ -507,18 +525,21 @@
str1->size * sizeof(double));
if (addSpace) {
str1->text[str1->len] = 0x20;
- str1->htext->append(xml?" ":" ");
+ str1->htext->append((xml || reFlow) ? " " : " ");
str1->xRight[str1->len] = str2->xMin;
++str1->len;
}
- if (addLineBreak) {
- str1->text[str1->len] = '\n';
- str1->htext->append("<br>");
- str1->xRight[str1->len] = str2->xMin;
- ++str1->len;
+ if (nextLine) {
+ if (addNewline) {
+ str1->text[str1->len] = '\n';
+ str1->htext->append(reFlow ? "\n" : "<br>");
+ str1->xRight[str1->len] = str2->xMin;
+ ++str1->len;
+ }
str1->yMin = str2->yMin;
str1->yMax = str2->yMax;
str1->xMax = str2->xMax;
+ lineStartX= str2->xMin;
int fontLineSize = hfont1->getLineSize();
int curLineSize = (int)(vertSpace + space);
if( curLineSize != fontLineSize )
@@ -570,26 +591,6 @@
}
str1->yxNext = str2->yxNext;
delete str2;
- } else { // keep strings separate
-// printf("no\n");
- GBool finish_a = str1->getLink() != NULL;
- GBool finish_bold = hfont1->isBold();
- GBool finish_italic = hfont1->isItalic();
- CloseTags( str1->htext, finish_a, finish_italic, finish_bold );
-
- str1->xMin = curX; str1->yMin = curY;
- str1 = str2;
- curX = str1->xMin; curY = str1->yMin;
- hfont1 = hfont2;
- if( hfont1->isBold() )
- str1->htext->insert(0,"<b>",3);
- if( hfont1->isItalic() )
- str1->htext->insert(0,"<i>",3);
- if( str1->getLink() != NULL ) {
- GooString *ls = str1->getLink()->getLinkStart();
- str1->htext->insert(0, ls);
- delete ls;
- }
}
}
str1->xMin = curX; str1->yMin = curY;
@@ -692,7 +693,7 @@
if( !noframes )
{
- fputs("</HEAD>\n<BODY bgcolor=\"#A0A0A0\" vlink=\"blue\" link=\"blue\">\n",pageFile);
+ fputs("</HEAD>\n<BODY vlink=\"blue\" link=\"blue\">\n",pageFile);
}
if( !ignore )
@@ -760,10 +761,10 @@
str=new GooString(tmp->htext);
fputs(str->getCString(),f);
delete str;
- fputs("<br>\n",f);
+ fputs(reFlow ? "<p>\n" : "<br>\n",f);
}
}
- fputs("<hr>\n",f);
+ if (!reFlow) fputs("<hr>\n",f);
}
}
@@ -997,7 +998,7 @@
dumpMetaVars(page);
fprintf(page,"</HEAD>\n");
- fprintf(page,"<BODY bgcolor=\"#A0A0A0\" vlink=\"blue\" link=\"blue\">\n");
+ fprintf(page,"<BODY vlink=\"blue\" link=\"blue\">\n");
}
}
ok = gTrue;
@@ -1444,11 +1445,11 @@
GooString *str=GooString::fromInt(page);
/* complex simple
frames file-4.html files.html#4
- noframes file.html#4 file.html#4
+ noframes #4 #4
*/
if (noframes)
{
- file->append(".html#");
+ file= new GooString("#");
file->append(str);
}
else
@@ -1566,7 +1567,7 @@
if (noframes)
{
output = page;
- fputs("<hr>\n", output);
+ if (!reFlow) fputs("<hr>\n", output);
}
else
{
@@ -1583,7 +1584,7 @@
GBool done = newOutlineLevel(output, outlines, catalog);
if (done && !complexMode)
- fputs("<hr>\n", output);
+ if (!reFlow) fputs("<hr>\n", output);
if (bClose)
{
--- pdftohtml.cc 2008/09/30 00:18:37 1.1
+++ pdftohtml.cc 2008/10/01 05:47:12
@@ -54,6 +54,7 @@
GBool noDrm=gFalse;
GBool showHidden = gFalse;
+GBool reFlow = gFalse; // Output "reflow" paragraphs
GBool noMerge = gFalse;
static char ownerPassword[33] = "";
static char userPassword[33] = "";
@@ -92,12 +93,14 @@
"zoom the pdf document (default 1.5)"},
{"-xml", argFlag, &xml, 0,
"output for XML post-processing"},
+ {"-reflow", argFlag, &reFlow, 0,
+ "output reflow paragraphs"},
{"-hidden", argFlag, &showHidden, 0,
"output hidden text"},
{"-nomerge", argFlag, &noMerge, 0,
"do not merge paragraphs"},
{"-enc", argString, textEncName, sizeof(textEncName),
- "output text encoding name"},
+ "output text encoding name (UTF-8, Latin1 etc)"},
{"-dev", argString, gsDevice, sizeof(gsDevice),
"output device name for Ghostscript (png16m, jpeg etc)"},
{"-v", argFlag, &printVersion, 0,
@@ -234,7 +237,7 @@
{
complexMode = gTrue;
noframes = gTrue;
- noMerge = gTrue;
+ noMerge = gFalse;
}
// get page range
--- pdftohtml.1 2008/09/30 00:18:37 1.1
+++ pdftohtml.1 2008/10/01 05:42:30
@@ -52,11 +52,16 @@
.B \-zoom <fp>
zoom the pdf document (default 1.5)
.TP
+.B \-reflow
+join paragraph lines together and separate paragraphs with a <p> tag. With
+this flag off, paragraph lines are separated by <br> tags and paragraphs are
+also separated by <br> tags.
+.TP
.B \-xml
output for XML post-processing
.TP
.B \-enc <string>
-output text encoding name
+output text encoding name (UTF-8, Latin1 etc)
.TP
.B \-opw <string>
owner password (for encrypted files)
@@ -71,7 +76,7 @@
output device name for Ghostscript (png16m, jpeg etc)
.TP
.B \-nomerge
-do not merge paragraphs
+separate output HTML lines with newline characters
.TP
.B \-nodrm
override document DRM settings
More information about the poppler
mailing list